| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9365617433414044, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009685230024213076, | |
| "grad_norm": 6.778852939605713, | |
| "learning_rate": 2.9999227754514262e-05, | |
| "loss": 0.8519, | |
| "num_input_tokens_seen": 25568, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01937046004842615, | |
| "grad_norm": 3.0029561519622803, | |
| "learning_rate": 2.9996911097572118e-05, | |
| "loss": 0.189, | |
| "num_input_tokens_seen": 51072, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.029055690072639227, | |
| "grad_norm": 5.477710247039795, | |
| "learning_rate": 2.9993050267710624e-05, | |
| "loss": 0.1648, | |
| "num_input_tokens_seen": 76416, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0387409200968523, | |
| "grad_norm": 4.35634183883667, | |
| "learning_rate": 2.9987645662464235e-05, | |
| "loss": 0.1905, | |
| "num_input_tokens_seen": 101344, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.048426150121065374, | |
| "grad_norm": 4.523565292358398, | |
| "learning_rate": 2.9980697838323884e-05, | |
| "loss": 0.1794, | |
| "num_input_tokens_seen": 126656, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05811138014527845, | |
| "grad_norm": 1.9348187446594238, | |
| "learning_rate": 2.9972207510679677e-05, | |
| "loss": 0.1528, | |
| "num_input_tokens_seen": 151200, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06779661016949153, | |
| "grad_norm": 2.981433629989624, | |
| "learning_rate": 2.996217555374725e-05, | |
| "loss": 0.1742, | |
| "num_input_tokens_seen": 175968, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0774818401937046, | |
| "grad_norm": 3.6294591426849365, | |
| "learning_rate": 2.9950603000477722e-05, | |
| "loss": 0.1565, | |
| "num_input_tokens_seen": 201280, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08716707021791767, | |
| "grad_norm": 2.5459301471710205, | |
| "learning_rate": 2.993749104245137e-05, | |
| "loss": 0.1499, | |
| "num_input_tokens_seen": 226432, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.09685230024213075, | |
| "grad_norm": 2.2721059322357178, | |
| "learning_rate": 2.992284102975491e-05, | |
| "loss": 0.1441, | |
| "num_input_tokens_seen": 251744, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.10653753026634383, | |
| "grad_norm": 2.0033624172210693, | |
| "learning_rate": 2.9906654470842492e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 276480, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1162227602905569, | |
| "grad_norm": 8.585118293762207, | |
| "learning_rate": 2.9888933032380397e-05, | |
| "loss": 0.1333, | |
| "num_input_tokens_seen": 301664, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.12590799031476999, | |
| "grad_norm": 1.423967719078064, | |
| "learning_rate": 2.9869678539075403e-05, | |
| "loss": 0.1728, | |
| "num_input_tokens_seen": 326784, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.13559322033898305, | |
| "grad_norm": 2.6306211948394775, | |
| "learning_rate": 2.9848892973486912e-05, | |
| "loss": 0.1281, | |
| "num_input_tokens_seen": 351328, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.14527845036319612, | |
| "grad_norm": 2.5618090629577637, | |
| "learning_rate": 2.9826578475822825e-05, | |
| "loss": 0.1136, | |
| "num_input_tokens_seen": 376000, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1549636803874092, | |
| "grad_norm": 2.694077730178833, | |
| "learning_rate": 2.980273734371914e-05, | |
| "loss": 0.1277, | |
| "num_input_tokens_seen": 400384, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.16464891041162227, | |
| "grad_norm": 2.632338047027588, | |
| "learning_rate": 2.9777372032003423e-05, | |
| "loss": 0.1028, | |
| "num_input_tokens_seen": 426432, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.17433414043583534, | |
| "grad_norm": 2.3446829319000244, | |
| "learning_rate": 2.975048515244199e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 451712, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.18401937046004843, | |
| "grad_norm": 1.8457319736480713, | |
| "learning_rate": 2.9722079473471035e-05, | |
| "loss": 0.142, | |
| "num_input_tokens_seen": 476960, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1937046004842615, | |
| "grad_norm": 1.8676010370254517, | |
| "learning_rate": 2.9692157919911536e-05, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 501440, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2033898305084746, | |
| "grad_norm": 4.593673229217529, | |
| "learning_rate": 2.966072357266811e-05, | |
| "loss": 0.1314, | |
| "num_input_tokens_seen": 526656, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.21307506053268765, | |
| "grad_norm": 3.9568676948547363, | |
| "learning_rate": 2.9627779668411795e-05, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 552544, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.22276029055690072, | |
| "grad_norm": 2.4331846237182617, | |
| "learning_rate": 2.9593329599246766e-05, | |
| "loss": 0.115, | |
| "num_input_tokens_seen": 577472, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2324455205811138, | |
| "grad_norm": 2.525543212890625, | |
| "learning_rate": 2.955737691236108e-05, | |
| "loss": 0.1158, | |
| "num_input_tokens_seen": 601856, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.24213075060532688, | |
| "grad_norm": 2.2355105876922607, | |
| "learning_rate": 2.9519925309661422e-05, | |
| "loss": 0.111, | |
| "num_input_tokens_seen": 627904, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.25181598062953997, | |
| "grad_norm": 4.165389537811279, | |
| "learning_rate": 2.948097864739194e-05, | |
| "loss": 0.1314, | |
| "num_input_tokens_seen": 651936, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.26150121065375304, | |
| "grad_norm": 3.1712851524353027, | |
| "learning_rate": 2.944054093573719e-05, | |
| "loss": 0.143, | |
| "num_input_tokens_seen": 676416, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2711864406779661, | |
| "grad_norm": 2.881716728210449, | |
| "learning_rate": 2.93986163384092e-05, | |
| "loss": 0.1121, | |
| "num_input_tokens_seen": 700832, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.28087167070217917, | |
| "grad_norm": 3.060872793197632, | |
| "learning_rate": 2.9355209172218777e-05, | |
| "loss": 0.1159, | |
| "num_input_tokens_seen": 725824, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.29055690072639223, | |
| "grad_norm": 4.449444770812988, | |
| "learning_rate": 2.931032390663101e-05, | |
| "loss": 0.133, | |
| "num_input_tokens_seen": 749408, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.30024213075060535, | |
| "grad_norm": 5.323568344116211, | |
| "learning_rate": 2.926396516330506e-05, | |
| "loss": 0.1172, | |
| "num_input_tokens_seen": 773984, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3099273607748184, | |
| "grad_norm": 3.144500732421875, | |
| "learning_rate": 2.921613771561829e-05, | |
| "loss": 0.136, | |
| "num_input_tokens_seen": 799168, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3196125907990315, | |
| "grad_norm": 2.433586359024048, | |
| "learning_rate": 2.916684648817478e-05, | |
| "loss": 0.0973, | |
| "num_input_tokens_seen": 824320, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.32929782082324455, | |
| "grad_norm": 3.349472761154175, | |
| "learning_rate": 2.9116096556298256e-05, | |
| "loss": 0.13, | |
| "num_input_tokens_seen": 849632, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3389830508474576, | |
| "grad_norm": 1.8927061557769775, | |
| "learning_rate": 2.9063893145509475e-05, | |
| "loss": 0.1257, | |
| "num_input_tokens_seen": 874400, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3486682808716707, | |
| "grad_norm": 3.972686529159546, | |
| "learning_rate": 2.901024163098822e-05, | |
| "loss": 0.1155, | |
| "num_input_tokens_seen": 899264, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3583535108958838, | |
| "grad_norm": 1.177282452583313, | |
| "learning_rate": 2.8955147537019815e-05, | |
| "loss": 0.1251, | |
| "num_input_tokens_seen": 924544, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.36803874092009686, | |
| "grad_norm": 1.9911576509475708, | |
| "learning_rate": 2.88986165364263e-05, | |
| "loss": 0.1147, | |
| "num_input_tokens_seen": 949792, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.37772397094430993, | |
| "grad_norm": 2.402615785598755, | |
| "learning_rate": 2.8840654449982344e-05, | |
| "loss": 0.1433, | |
| "num_input_tokens_seen": 974112, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.387409200968523, | |
| "grad_norm": 1.3184998035430908, | |
| "learning_rate": 2.8781267245815898e-05, | |
| "loss": 0.1117, | |
| "num_input_tokens_seen": 999168, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.39709443099273606, | |
| "grad_norm": 1.9284625053405762, | |
| "learning_rate": 2.8720461038793672e-05, | |
| "loss": 0.1353, | |
| "num_input_tokens_seen": 1024320, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.4067796610169492, | |
| "grad_norm": 3.1020259857177734, | |
| "learning_rate": 2.8658242089891515e-05, | |
| "loss": 0.1165, | |
| "num_input_tokens_seen": 1049088, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.41646489104116224, | |
| "grad_norm": 2.203179359436035, | |
| "learning_rate": 2.8594616805549752e-05, | |
| "loss": 0.1215, | |
| "num_input_tokens_seen": 1073632, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.4261501210653753, | |
| "grad_norm": 2.053194522857666, | |
| "learning_rate": 2.8529591737013526e-05, | |
| "loss": 0.1066, | |
| "num_input_tokens_seen": 1098208, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.4358353510895884, | |
| "grad_norm": 2.780935049057007, | |
| "learning_rate": 2.8463173579658258e-05, | |
| "loss": 0.0879, | |
| "num_input_tokens_seen": 1122336, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.44552058111380144, | |
| "grad_norm": 1.9929611682891846, | |
| "learning_rate": 2.8395369172300235e-05, | |
| "loss": 0.1141, | |
| "num_input_tokens_seen": 1147392, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4552058111380145, | |
| "grad_norm": 1.1469779014587402, | |
| "learning_rate": 2.8326185496492464e-05, | |
| "loss": 0.1052, | |
| "num_input_tokens_seen": 1173248, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4648910411622276, | |
| "grad_norm": 2.501117706298828, | |
| "learning_rate": 2.825562967580579e-05, | |
| "loss": 0.1086, | |
| "num_input_tokens_seen": 1197984, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4745762711864407, | |
| "grad_norm": 2.0266308784484863, | |
| "learning_rate": 2.8183708975095406e-05, | |
| "loss": 0.1201, | |
| "num_input_tokens_seen": 1222720, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.48426150121065376, | |
| "grad_norm": 1.1120251417160034, | |
| "learning_rate": 2.8110430799752845e-05, | |
| "loss": 0.1319, | |
| "num_input_tokens_seen": 1247232, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4939467312348668, | |
| "grad_norm": 1.2014496326446533, | |
| "learning_rate": 2.8035802694943457e-05, | |
| "loss": 0.1071, | |
| "num_input_tokens_seen": 1273184, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5036319612590799, | |
| "grad_norm": 1.1245245933532715, | |
| "learning_rate": 2.7959832344829512e-05, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 1298688, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.513317191283293, | |
| "grad_norm": 2.031115770339966, | |
| "learning_rate": 2.7882527571779003e-05, | |
| "loss": 0.1196, | |
| "num_input_tokens_seen": 1324128, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5230024213075061, | |
| "grad_norm": 1.7691289186477661, | |
| "learning_rate": 2.78038963355602e-05, | |
| "loss": 0.1334, | |
| "num_input_tokens_seen": 1349120, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.5326876513317191, | |
| "grad_norm": 2.9496989250183105, | |
| "learning_rate": 2.7723946732522055e-05, | |
| "loss": 0.1109, | |
| "num_input_tokens_seen": 1374304, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5423728813559322, | |
| "grad_norm": 2.2881715297698975, | |
| "learning_rate": 2.764268699476058e-05, | |
| "loss": 0.1274, | |
| "num_input_tokens_seen": 1399136, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5520581113801453, | |
| "grad_norm": 1.9754095077514648, | |
| "learning_rate": 2.756012548927119e-05, | |
| "loss": 0.1397, | |
| "num_input_tokens_seen": 1424672, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5617433414043583, | |
| "grad_norm": 1.9883428812026978, | |
| "learning_rate": 2.7476270717087215e-05, | |
| "loss": 0.101, | |
| "num_input_tokens_seen": 1449024, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.9653130769729614, | |
| "learning_rate": 2.7391131312404556e-05, | |
| "loss": 0.0941, | |
| "num_input_tokens_seen": 1475264, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5811138014527845, | |
| "grad_norm": 4.576601028442383, | |
| "learning_rate": 2.7304716041692663e-05, | |
| "loss": 0.0865, | |
| "num_input_tokens_seen": 1500064, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5907990314769975, | |
| "grad_norm": 2.4046311378479004, | |
| "learning_rate": 2.7217033802791906e-05, | |
| "loss": 0.1596, | |
| "num_input_tokens_seen": 1524448, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6004842615012107, | |
| "grad_norm": 1.7785555124282837, | |
| "learning_rate": 2.7128093623997368e-05, | |
| "loss": 0.0891, | |
| "num_input_tokens_seen": 1549536, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6101694915254238, | |
| "grad_norm": 2.2736170291900635, | |
| "learning_rate": 2.7037904663129262e-05, | |
| "loss": 0.1085, | |
| "num_input_tokens_seen": 1573408, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6198547215496368, | |
| "grad_norm": 1.0862345695495605, | |
| "learning_rate": 2.6946476206589972e-05, | |
| "loss": 0.1023, | |
| "num_input_tokens_seen": 1597888, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6295399515738499, | |
| "grad_norm": 0.5358290672302246, | |
| "learning_rate": 2.6853817668407875e-05, | |
| "loss": 0.0669, | |
| "num_input_tokens_seen": 1623296, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.639225181598063, | |
| "grad_norm": 2.3138749599456787, | |
| "learning_rate": 2.6759938589268023e-05, | |
| "loss": 0.1017, | |
| "num_input_tokens_seen": 1649216, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.648910411622276, | |
| "grad_norm": 3.2054226398468018, | |
| "learning_rate": 2.6664848635529742e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 1673760, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.6585956416464891, | |
| "grad_norm": 1.8352829217910767, | |
| "learning_rate": 2.6568557598231385e-05, | |
| "loss": 0.1081, | |
| "num_input_tokens_seen": 1698592, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.6682808716707022, | |
| "grad_norm": 1.203284740447998, | |
| "learning_rate": 2.6471075392082125e-05, | |
| "loss": 0.1037, | |
| "num_input_tokens_seen": 1723296, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6779661016949152, | |
| "grad_norm": 1.635628581047058, | |
| "learning_rate": 2.6372412054441116e-05, | |
| "loss": 0.1216, | |
| "num_input_tokens_seen": 1748384, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6876513317191283, | |
| "grad_norm": 0.8993457555770874, | |
| "learning_rate": 2.6272577744283965e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 1773600, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6973365617433414, | |
| "grad_norm": 1.7306419610977173, | |
| "learning_rate": 2.617158274115673e-05, | |
| "loss": 0.1034, | |
| "num_input_tokens_seen": 1798656, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.7070217917675545, | |
| "grad_norm": 2.770066976547241, | |
| "learning_rate": 2.6069437444117432e-05, | |
| "loss": 0.0872, | |
| "num_input_tokens_seen": 1824544, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7167070217917676, | |
| "grad_norm": 2.3590221405029297, | |
| "learning_rate": 2.596615237066535e-05, | |
| "loss": 0.1063, | |
| "num_input_tokens_seen": 1848896, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.7263922518159807, | |
| "grad_norm": 1.0496519804000854, | |
| "learning_rate": 2.586173815565805e-05, | |
| "loss": 0.1104, | |
| "num_input_tokens_seen": 1873248, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.7360774818401937, | |
| "grad_norm": 1.513573408126831, | |
| "learning_rate": 2.575620555021634e-05, | |
| "loss": 0.1125, | |
| "num_input_tokens_seen": 1897184, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.7457627118644068, | |
| "grad_norm": 1.5545728206634521, | |
| "learning_rate": 2.564956542061732e-05, | |
| "loss": 0.0969, | |
| "num_input_tokens_seen": 1922368, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.7554479418886199, | |
| "grad_norm": 1.9260263442993164, | |
| "learning_rate": 2.5541828747175477e-05, | |
| "loss": 0.1142, | |
| "num_input_tokens_seen": 1947904, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.7651331719128329, | |
| "grad_norm": 2.396538734436035, | |
| "learning_rate": 2.543300662311211e-05, | |
| "loss": 0.0926, | |
| "num_input_tokens_seen": 1971872, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.774818401937046, | |
| "grad_norm": 1.7069965600967407, | |
| "learning_rate": 2.532311025341309e-05, | |
| "loss": 0.0802, | |
| "num_input_tokens_seen": 1996352, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.784503631961259, | |
| "grad_norm": 5.540910243988037, | |
| "learning_rate": 2.5212150953675133e-05, | |
| "loss": 0.1248, | |
| "num_input_tokens_seen": 2020480, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7941888619854721, | |
| "grad_norm": 1.7795952558517456, | |
| "learning_rate": 2.5100140148940688e-05, | |
| "loss": 0.0767, | |
| "num_input_tokens_seen": 2044448, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.8038740920096852, | |
| "grad_norm": 2.7387983798980713, | |
| "learning_rate": 2.498708937252153e-05, | |
| "loss": 0.1239, | |
| "num_input_tokens_seen": 2070400, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 2.1243462562561035, | |
| "learning_rate": 2.4873010264811222e-05, | |
| "loss": 0.108, | |
| "num_input_tokens_seen": 2095392, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.8232445520581114, | |
| "grad_norm": 0.9928631782531738, | |
| "learning_rate": 2.4757914572086555e-05, | |
| "loss": 0.0994, | |
| "num_input_tokens_seen": 2120192, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.8329297820823245, | |
| "grad_norm": 6.047460556030273, | |
| "learning_rate": 2.464181414529809e-05, | |
| "loss": 0.0927, | |
| "num_input_tokens_seen": 2144384, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.8426150121065376, | |
| "grad_norm": 2.2197115421295166, | |
| "learning_rate": 2.4524720938849883e-05, | |
| "loss": 0.1328, | |
| "num_input_tokens_seen": 2168704, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.8523002421307506, | |
| "grad_norm": 2.0752601623535156, | |
| "learning_rate": 2.440664700936861e-05, | |
| "loss": 0.1229, | |
| "num_input_tokens_seen": 2193248, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.8619854721549637, | |
| "grad_norm": 1.00425386428833, | |
| "learning_rate": 2.4287604514462152e-05, | |
| "loss": 0.0957, | |
| "num_input_tokens_seen": 2217568, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.8716707021791767, | |
| "grad_norm": 1.9153094291687012, | |
| "learning_rate": 2.416760571146774e-05, | |
| "loss": 0.0975, | |
| "num_input_tokens_seen": 2242048, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8813559322033898, | |
| "grad_norm": 2.3558013439178467, | |
| "learning_rate": 2.4046662956189898e-05, | |
| "loss": 0.1068, | |
| "num_input_tokens_seen": 2266112, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.8910411622276029, | |
| "grad_norm": 2.546351909637451, | |
| "learning_rate": 2.3924788701628197e-05, | |
| "loss": 0.0688, | |
| "num_input_tokens_seen": 2290720, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.9007263922518159, | |
| "grad_norm": 1.2526168823242188, | |
| "learning_rate": 2.3801995496695028e-05, | |
| "loss": 0.1141, | |
| "num_input_tokens_seen": 2315488, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.910411622276029, | |
| "grad_norm": 2.134089231491089, | |
| "learning_rate": 2.367829598492348e-05, | |
| "loss": 0.1328, | |
| "num_input_tokens_seen": 2340992, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.9200968523002422, | |
| "grad_norm": 1.332915186882019, | |
| "learning_rate": 2.3553702903165502e-05, | |
| "loss": 0.1, | |
| "num_input_tokens_seen": 2366880, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.9297820823244553, | |
| "grad_norm": 1.5140970945358276, | |
| "learning_rate": 2.3428229080280407e-05, | |
| "loss": 0.1089, | |
| "num_input_tokens_seen": 2392000, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.9394673123486683, | |
| "grad_norm": 1.531954288482666, | |
| "learning_rate": 2.330188743581398e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 2417472, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.9491525423728814, | |
| "grad_norm": 1.3347736597061157, | |
| "learning_rate": 2.3174690978668155e-05, | |
| "loss": 0.1205, | |
| "num_input_tokens_seen": 2442496, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.9588377723970944, | |
| "grad_norm": 3.1497702598571777, | |
| "learning_rate": 2.3046652805761588e-05, | |
| "loss": 0.1004, | |
| "num_input_tokens_seen": 2467392, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.9685230024213075, | |
| "grad_norm": 1.6756023168563843, | |
| "learning_rate": 2.2917786100681078e-05, | |
| "loss": 0.1007, | |
| "num_input_tokens_seen": 2492768, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9782082324455206, | |
| "grad_norm": 2.56594181060791, | |
| "learning_rate": 2.2788104132324125e-05, | |
| "loss": 0.1179, | |
| "num_input_tokens_seen": 2518176, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.9878934624697336, | |
| "grad_norm": 2.1090595722198486, | |
| "learning_rate": 2.2657620253532685e-05, | |
| "loss": 0.0971, | |
| "num_input_tokens_seen": 2543296, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.9975786924939467, | |
| "grad_norm": 0.41959595680236816, | |
| "learning_rate": 2.252634789971827e-05, | |
| "loss": 0.0932, | |
| "num_input_tokens_seen": 2567680, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.006779661016949, | |
| "grad_norm": 1.6389803886413574, | |
| "learning_rate": 2.2394300587478566e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 2591016, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.0164648910411622, | |
| "grad_norm": 1.4045557975769043, | |
| "learning_rate": 2.2261491913205684e-05, | |
| "loss": 0.0985, | |
| "num_input_tokens_seen": 2615752, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.0261501210653754, | |
| "grad_norm": 2.0734925270080566, | |
| "learning_rate": 2.212793555168617e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 2640200, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.0358353510895884, | |
| "grad_norm": 2.1590147018432617, | |
| "learning_rate": 2.1993645254692994e-05, | |
| "loss": 0.116, | |
| "num_input_tokens_seen": 2665416, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.0455205811138015, | |
| "grad_norm": 1.739646553993225, | |
| "learning_rate": 2.1858634849569578e-05, | |
| "loss": 0.0972, | |
| "num_input_tokens_seen": 2690376, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.0552058111380145, | |
| "grad_norm": 0.6458954215049744, | |
| "learning_rate": 2.1722918237806042e-05, | |
| "loss": 0.0884, | |
| "num_input_tokens_seen": 2715080, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.0648910411622277, | |
| "grad_norm": 2.2830138206481934, | |
| "learning_rate": 2.158650939360782e-05, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 2740424, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.0745762711864406, | |
| "grad_norm": 1.5225194692611694, | |
| "learning_rate": 2.1449422362456794e-05, | |
| "loss": 0.0813, | |
| "num_input_tokens_seen": 2765640, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.0842615012106538, | |
| "grad_norm": 1.683604121208191, | |
| "learning_rate": 2.13116712596651e-05, | |
| "loss": 0.0953, | |
| "num_input_tokens_seen": 2791176, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.0939467312348667, | |
| "grad_norm": 1.5679166316986084, | |
| "learning_rate": 2.1173270268921703e-05, | |
| "loss": 0.0933, | |
| "num_input_tokens_seen": 2816072, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.10363196125908, | |
| "grad_norm": 1.3097947835922241, | |
| "learning_rate": 2.1034233640831988e-05, | |
| "loss": 0.0819, | |
| "num_input_tokens_seen": 2840776, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.113317191283293, | |
| "grad_norm": 0.5728388428688049, | |
| "learning_rate": 2.0894575691450396e-05, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 2865416, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.123002421307506, | |
| "grad_norm": 2.3043558597564697, | |
| "learning_rate": 2.0754310800806395e-05, | |
| "loss": 0.0748, | |
| "num_input_tokens_seen": 2890248, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.1326876513317192, | |
| "grad_norm": 1.2087112665176392, | |
| "learning_rate": 2.0613453411423797e-05, | |
| "loss": 0.0959, | |
| "num_input_tokens_seen": 2916392, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.1423728813559322, | |
| "grad_norm": 1.5639240741729736, | |
| "learning_rate": 2.0472018026833684e-05, | |
| "loss": 0.0709, | |
| "num_input_tokens_seen": 2941160, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.1520581113801454, | |
| "grad_norm": 0.5889459848403931, | |
| "learning_rate": 2.0330019210081022e-05, | |
| "loss": 0.0731, | |
| "num_input_tokens_seen": 2966120, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.1617433414043583, | |
| "grad_norm": 1.854230523109436, | |
| "learning_rate": 2.0187471582225173e-05, | |
| "loss": 0.1005, | |
| "num_input_tokens_seen": 2990088, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.1714285714285715, | |
| "grad_norm": 2.01247239112854, | |
| "learning_rate": 2.004438982083442e-05, | |
| "loss": 0.0579, | |
| "num_input_tokens_seen": 3015400, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.1811138014527844, | |
| "grad_norm": 2.292900323867798, | |
| "learning_rate": 1.9900788658474677e-05, | |
| "loss": 0.0792, | |
| "num_input_tokens_seen": 3039464, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.1907990314769976, | |
| "grad_norm": 1.4194159507751465, | |
| "learning_rate": 1.975668288119252e-05, | |
| "loss": 0.057, | |
| "num_input_tokens_seen": 3063816, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.2004842615012106, | |
| "grad_norm": 1.0512489080429077, | |
| "learning_rate": 1.961208732699275e-05, | |
| "loss": 0.102, | |
| "num_input_tokens_seen": 3088968, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.2101694915254237, | |
| "grad_norm": 0.9465106129646301, | |
| "learning_rate": 1.9467016884310565e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 3113736, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.2198547215496367, | |
| "grad_norm": 1.274294376373291, | |
| "learning_rate": 1.9321486490478565e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 3138344, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.2295399515738499, | |
| "grad_norm": 1.9390579462051392, | |
| "learning_rate": 1.91755111301887e-05, | |
| "loss": 0.0711, | |
| "num_input_tokens_seen": 3163496, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.239225181598063, | |
| "grad_norm": 1.2855744361877441, | |
| "learning_rate": 1.902910583394938e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 3188392, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.248910411622276, | |
| "grad_norm": 2.931248188018799, | |
| "learning_rate": 1.888228567653781e-05, | |
| "loss": 0.0448, | |
| "num_input_tokens_seen": 3213224, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.2585956416464892, | |
| "grad_norm": 1.9991300106048584, | |
| "learning_rate": 1.873506577544784e-05, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 3238568, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.2682808716707021, | |
| "grad_norm": 1.3530927896499634, | |
| "learning_rate": 1.8587461289333327e-05, | |
| "loss": 0.1043, | |
| "num_input_tokens_seen": 3264264, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.2779661016949153, | |
| "grad_norm": 2.07991099357605, | |
| "learning_rate": 1.8439487416447353e-05, | |
| "loss": 0.1037, | |
| "num_input_tokens_seen": 3288840, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.2876513317191283, | |
| "grad_norm": 1.8533947467803955, | |
| "learning_rate": 1.8291159393077294e-05, | |
| "loss": 0.0928, | |
| "num_input_tokens_seen": 3313832, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.2973365617433414, | |
| "grad_norm": 1.118119716644287, | |
| "learning_rate": 1.814249249197602e-05, | |
| "loss": 0.0775, | |
| "num_input_tokens_seen": 3337736, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.3070217917675544, | |
| "grad_norm": 2.740079641342163, | |
| "learning_rate": 1.7993502020789294e-05, | |
| "loss": 0.0521, | |
| "num_input_tokens_seen": 3362024, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.3167070217917676, | |
| "grad_norm": 1.9268351793289185, | |
| "learning_rate": 1.7844203320479614e-05, | |
| "loss": 0.0687, | |
| "num_input_tokens_seen": 3387496, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.3263922518159807, | |
| "grad_norm": 2.3576388359069824, | |
| "learning_rate": 1.7694611763746632e-05, | |
| "loss": 0.0704, | |
| "num_input_tokens_seen": 3412072, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.3360774818401937, | |
| "grad_norm": 1.127432942390442, | |
| "learning_rate": 1.754474275344427e-05, | |
| "loss": 0.0826, | |
| "num_input_tokens_seen": 3437096, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.3457627118644067, | |
| "grad_norm": 4.377537250518799, | |
| "learning_rate": 1.7394611720994747e-05, | |
| "loss": 0.0445, | |
| "num_input_tokens_seen": 3462120, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.3554479418886198, | |
| "grad_norm": 2.1285200119018555, | |
| "learning_rate": 1.724423412479967e-05, | |
| "loss": 0.0951, | |
| "num_input_tokens_seen": 3486952, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.365133171912833, | |
| "grad_norm": 0.16216270625591278, | |
| "learning_rate": 1.7093625448648348e-05, | |
| "loss": 0.0539, | |
| "num_input_tokens_seen": 3512264, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.374818401937046, | |
| "grad_norm": 2.1299915313720703, | |
| "learning_rate": 1.694280120012349e-05, | |
| "loss": 0.0848, | |
| "num_input_tokens_seen": 3537192, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.3845036319612591, | |
| "grad_norm": 2.476757049560547, | |
| "learning_rate": 1.6791776909004434e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 3560872, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.394188861985472, | |
| "grad_norm": 0.4373377561569214, | |
| "learning_rate": 1.664056812566812e-05, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 3586216, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.4038740920096853, | |
| "grad_norm": 1.9471170902252197, | |
| "learning_rate": 1.648919041948792e-05, | |
| "loss": 0.0798, | |
| "num_input_tokens_seen": 3610792, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.4135593220338982, | |
| "grad_norm": 2.911750316619873, | |
| "learning_rate": 1.6337659377230544e-05, | |
| "loss": 0.0897, | |
| "num_input_tokens_seen": 3634760, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.4232445520581114, | |
| "grad_norm": 2.9474802017211914, | |
| "learning_rate": 1.61859906014511e-05, | |
| "loss": 0.0858, | |
| "num_input_tokens_seen": 3659560, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.4329297820823244, | |
| "grad_norm": 0.6501768827438354, | |
| "learning_rate": 1.6034199708886573e-05, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 3684840, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.4426150121065375, | |
| "grad_norm": 1.6708017587661743, | |
| "learning_rate": 1.5882302328847847e-05, | |
| "loss": 0.0842, | |
| "num_input_tokens_seen": 3709096, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.4523002421307507, | |
| "grad_norm": 1.5014967918395996, | |
| "learning_rate": 1.5730314101610376e-05, | |
| "loss": 0.0367, | |
| "num_input_tokens_seen": 3734728, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4619854721549637, | |
| "grad_norm": 3.2587804794311523, | |
| "learning_rate": 1.5578250676803824e-05, | |
| "loss": 0.1085, | |
| "num_input_tokens_seen": 3758984, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.4716707021791768, | |
| "grad_norm": 6.304242134094238, | |
| "learning_rate": 1.5426127711800636e-05, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 3784296, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.4813559322033898, | |
| "grad_norm": 1.1681016683578491, | |
| "learning_rate": 1.5273960870103872e-05, | |
| "loss": 0.0705, | |
| "num_input_tokens_seen": 3809768, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.491041162227603, | |
| "grad_norm": 1.111617922782898, | |
| "learning_rate": 1.5121765819734418e-05, | |
| "loss": 0.071, | |
| "num_input_tokens_seen": 3834536, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.5007263922518161, | |
| "grad_norm": 1.7780523300170898, | |
| "learning_rate": 1.4969558231617681e-05, | |
| "loss": 0.0648, | |
| "num_input_tokens_seen": 3858792, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.510411622276029, | |
| "grad_norm": 2.2017934322357178, | |
| "learning_rate": 1.4817353777970038e-05, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 3883976, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.520096852300242, | |
| "grad_norm": 1.8567978143692017, | |
| "learning_rate": 1.466516813068512e-05, | |
| "loss": 0.0726, | |
| "num_input_tokens_seen": 3908392, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.5297820823244552, | |
| "grad_norm": 2.567291021347046, | |
| "learning_rate": 1.451301695972015e-05, | |
| "loss": 0.0882, | |
| "num_input_tokens_seen": 3932552, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.5394673123486684, | |
| "grad_norm": 1.9968935251235962, | |
| "learning_rate": 1.436091593148244e-05, | |
| "loss": 0.1149, | |
| "num_input_tokens_seen": 3957672, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.5491525423728814, | |
| "grad_norm": 1.9058917760849, | |
| "learning_rate": 1.4208880707216323e-05, | |
| "loss": 0.0841, | |
| "num_input_tokens_seen": 3982824, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5588377723970943, | |
| "grad_norm": 1.9218000173568726, | |
| "learning_rate": 1.405692694139054e-05, | |
| "loss": 0.0896, | |
| "num_input_tokens_seen": 4008072, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.5685230024213075, | |
| "grad_norm": 1.5786553621292114, | |
| "learning_rate": 1.3905070280086387e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 4033096, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.5782082324455207, | |
| "grad_norm": 2.503990888595581, | |
| "learning_rate": 1.3753326359386695e-05, | |
| "loss": 0.077, | |
| "num_input_tokens_seen": 4058120, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.5878934624697336, | |
| "grad_norm": 1.5616143941879272, | |
| "learning_rate": 1.3601710803765814e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 4082792, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.5975786924939466, | |
| "grad_norm": 1.2533211708068848, | |
| "learning_rate": 1.3450239224480884e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 4107336, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.6072639225181597, | |
| "grad_norm": 1.1046490669250488, | |
| "learning_rate": 1.329892721796433e-05, | |
| "loss": 0.0985, | |
| "num_input_tokens_seen": 4132456, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.616949152542373, | |
| "grad_norm": 1.143494725227356, | |
| "learning_rate": 1.314779036421802e-05, | |
| "loss": 0.0547, | |
| "num_input_tokens_seen": 4156584, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.626634382566586, | |
| "grad_norm": 2.6082706451416016, | |
| "learning_rate": 1.2996844225209033e-05, | |
| "loss": 0.0919, | |
| "num_input_tokens_seen": 4181448, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.636319612590799, | |
| "grad_norm": 2.4191458225250244, | |
| "learning_rate": 1.2846104343267283e-05, | |
| "loss": 0.1204, | |
| "num_input_tokens_seen": 4207560, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.646004842615012, | |
| "grad_norm": 2.051799774169922, | |
| "learning_rate": 1.2695586239485223e-05, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 4232040, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6556900726392252, | |
| "grad_norm": 1.525844931602478, | |
| "learning_rate": 1.254530541211968e-05, | |
| "loss": 0.0805, | |
| "num_input_tokens_seen": 4257576, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.6653753026634384, | |
| "grad_norm": 0.9474373459815979, | |
| "learning_rate": 1.2395277334996045e-05, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 4282472, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.6750605326876513, | |
| "grad_norm": 1.8932424783706665, | |
| "learning_rate": 1.2245517455915036e-05, | |
| "loss": 0.0734, | |
| "num_input_tokens_seen": 4306792, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.6847457627118643, | |
| "grad_norm": 1.9888746738433838, | |
| "learning_rate": 1.2096041195062051e-05, | |
| "loss": 0.0831, | |
| "num_input_tokens_seen": 4333384, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.6944309927360774, | |
| "grad_norm": 1.8355742692947388, | |
| "learning_rate": 1.1946863943419452e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 4358344, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.7041162227602906, | |
| "grad_norm": 2.8447251319885254, | |
| "learning_rate": 1.1798001061181799e-05, | |
| "loss": 0.0988, | |
| "num_input_tokens_seen": 4381768, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.7138014527845038, | |
| "grad_norm": 2.670257806777954, | |
| "learning_rate": 1.1649467876174252e-05, | |
| "loss": 0.0936, | |
| "num_input_tokens_seen": 4405192, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.7234866828087168, | |
| "grad_norm": 1.188839077949524, | |
| "learning_rate": 1.1501279682274368e-05, | |
| "loss": 0.0901, | |
| "num_input_tokens_seen": 4430344, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.7331719128329297, | |
| "grad_norm": 2.494746685028076, | |
| "learning_rate": 1.1353451737837312e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 4455336, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.7428571428571429, | |
| "grad_norm": 1.3223942518234253, | |
| "learning_rate": 1.1205999264124788e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 4480648, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.752542372881356, | |
| "grad_norm": 1.3812003135681152, | |
| "learning_rate": 1.105893744373776e-05, | |
| "loss": 0.0788, | |
| "num_input_tokens_seen": 4506600, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.762227602905569, | |
| "grad_norm": 0.7805346250534058, | |
| "learning_rate": 1.0912281419053139e-05, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 4531368, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.771912832929782, | |
| "grad_norm": 1.105878472328186, | |
| "learning_rate": 1.0766046290664662e-05, | |
| "loss": 0.0779, | |
| "num_input_tokens_seen": 4555272, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.7815980629539951, | |
| "grad_norm": 1.8672295808792114, | |
| "learning_rate": 1.0620247115828044e-05, | |
| "loss": 0.0838, | |
| "num_input_tokens_seen": 4580328, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.7912832929782083, | |
| "grad_norm": 1.844306468963623, | |
| "learning_rate": 1.047489890691055e-05, | |
| "loss": 0.0594, | |
| "num_input_tokens_seen": 4605768, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.8009685230024213, | |
| "grad_norm": 1.2717005014419556, | |
| "learning_rate": 1.0330016629845276e-05, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 4631048, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.8106537530266342, | |
| "grad_norm": 3.5843582153320312, | |
| "learning_rate": 1.0185615202590144e-05, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 4656456, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.8203389830508474, | |
| "grad_norm": 4.254288673400879, | |
| "learning_rate": 1.004170949359187e-05, | |
| "loss": 0.0654, | |
| "num_input_tokens_seen": 4681384, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.8300242130750606, | |
| "grad_norm": 1.351646065711975, | |
| "learning_rate": 9.89831432025501e-06, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 4706216, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.8397094430992738, | |
| "grad_norm": 1.9015384912490845, | |
| "learning_rate": 9.755444447416255e-06, | |
| "loss": 0.0829, | |
| "num_input_tokens_seen": 4730984, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.8493946731234867, | |
| "grad_norm": 1.3803085088729858, | |
| "learning_rate": 9.613114585824196e-06, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 4755112, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.8590799031476997, | |
| "grad_norm": 6.487275123596191, | |
| "learning_rate": 9.471339390624574e-06, | |
| "loss": 0.0781, | |
| "num_input_tokens_seen": 4780232, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.8687651331719128, | |
| "grad_norm": 2.182865619659424, | |
| "learning_rate": 9.330133459851323e-06, | |
| "loss": 0.0908, | |
| "num_input_tokens_seen": 4805192, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.878450363196126, | |
| "grad_norm": 0.42010384798049927, | |
| "learning_rate": 9.189511332923463e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 4830856, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.888135593220339, | |
| "grad_norm": 1.609157919883728, | |
| "learning_rate": 9.049487489148008e-06, | |
| "loss": 0.0912, | |
| "num_input_tokens_seen": 4855656, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.897820823244552, | |
| "grad_norm": 2.4291250705718994, | |
| "learning_rate": 8.910076346229134e-06, | |
| "loss": 0.0746, | |
| "num_input_tokens_seen": 4880392, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.907506053268765, | |
| "grad_norm": 2.243717670440674, | |
| "learning_rate": 8.77129225878361e-06, | |
| "loss": 0.1066, | |
| "num_input_tokens_seen": 4905320, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.9171912832929783, | |
| "grad_norm": 2.145559072494507, | |
| "learning_rate": 8.633149516862777e-06, | |
| "loss": 0.0839, | |
| "num_input_tokens_seen": 4930536, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.9268765133171912, | |
| "grad_norm": 0.6746326088905334, | |
| "learning_rate": 8.495662344481135e-06, | |
| "loss": 0.0527, | |
| "num_input_tokens_seen": 4956168, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.9365617433414044, | |
| "grad_norm": 1.293521761894226, | |
| "learning_rate": 8.358844898151791e-06, | |
| "loss": 0.1033, | |
| "num_input_tokens_seen": 4980584, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3096, | |
| "num_input_tokens_seen": 4980584, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.1321214729853338e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |