| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9975786924939465, | |
| "eval_steps": 500, | |
| "global_step": 3096, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009685230024213076, | |
| "grad_norm": 6.778852939605713, | |
| "learning_rate": 2.9999227754514262e-05, | |
| "loss": 0.8519, | |
| "num_input_tokens_seen": 25568, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01937046004842615, | |
| "grad_norm": 3.0029561519622803, | |
| "learning_rate": 2.9996911097572118e-05, | |
| "loss": 0.189, | |
| "num_input_tokens_seen": 51072, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.029055690072639227, | |
| "grad_norm": 5.477710247039795, | |
| "learning_rate": 2.9993050267710624e-05, | |
| "loss": 0.1648, | |
| "num_input_tokens_seen": 76416, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0387409200968523, | |
| "grad_norm": 4.35634183883667, | |
| "learning_rate": 2.9987645662464235e-05, | |
| "loss": 0.1905, | |
| "num_input_tokens_seen": 101344, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.048426150121065374, | |
| "grad_norm": 4.523565292358398, | |
| "learning_rate": 2.9980697838323884e-05, | |
| "loss": 0.1794, | |
| "num_input_tokens_seen": 126656, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05811138014527845, | |
| "grad_norm": 1.9348187446594238, | |
| "learning_rate": 2.9972207510679677e-05, | |
| "loss": 0.1528, | |
| "num_input_tokens_seen": 151200, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06779661016949153, | |
| "grad_norm": 2.981433629989624, | |
| "learning_rate": 2.996217555374725e-05, | |
| "loss": 0.1742, | |
| "num_input_tokens_seen": 175968, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0774818401937046, | |
| "grad_norm": 3.6294591426849365, | |
| "learning_rate": 2.9950603000477722e-05, | |
| "loss": 0.1565, | |
| "num_input_tokens_seen": 201280, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08716707021791767, | |
| "grad_norm": 2.5459301471710205, | |
| "learning_rate": 2.993749104245137e-05, | |
| "loss": 0.1499, | |
| "num_input_tokens_seen": 226432, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.09685230024213075, | |
| "grad_norm": 2.2721059322357178, | |
| "learning_rate": 2.992284102975491e-05, | |
| "loss": 0.1441, | |
| "num_input_tokens_seen": 251744, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.10653753026634383, | |
| "grad_norm": 2.0033624172210693, | |
| "learning_rate": 2.9906654470842492e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 276480, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1162227602905569, | |
| "grad_norm": 8.585118293762207, | |
| "learning_rate": 2.9888933032380397e-05, | |
| "loss": 0.1333, | |
| "num_input_tokens_seen": 301664, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.12590799031476999, | |
| "grad_norm": 1.423967719078064, | |
| "learning_rate": 2.9869678539075403e-05, | |
| "loss": 0.1728, | |
| "num_input_tokens_seen": 326784, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.13559322033898305, | |
| "grad_norm": 2.6306211948394775, | |
| "learning_rate": 2.9848892973486912e-05, | |
| "loss": 0.1281, | |
| "num_input_tokens_seen": 351328, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.14527845036319612, | |
| "grad_norm": 2.5618090629577637, | |
| "learning_rate": 2.9826578475822825e-05, | |
| "loss": 0.1136, | |
| "num_input_tokens_seen": 376000, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1549636803874092, | |
| "grad_norm": 2.694077730178833, | |
| "learning_rate": 2.980273734371914e-05, | |
| "loss": 0.1277, | |
| "num_input_tokens_seen": 400384, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.16464891041162227, | |
| "grad_norm": 2.632338047027588, | |
| "learning_rate": 2.9777372032003423e-05, | |
| "loss": 0.1028, | |
| "num_input_tokens_seen": 426432, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.17433414043583534, | |
| "grad_norm": 2.3446829319000244, | |
| "learning_rate": 2.975048515244199e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 451712, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.18401937046004843, | |
| "grad_norm": 1.8457319736480713, | |
| "learning_rate": 2.9722079473471035e-05, | |
| "loss": 0.142, | |
| "num_input_tokens_seen": 476960, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1937046004842615, | |
| "grad_norm": 1.8676010370254517, | |
| "learning_rate": 2.9692157919911536e-05, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 501440, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2033898305084746, | |
| "grad_norm": 4.593673229217529, | |
| "learning_rate": 2.966072357266811e-05, | |
| "loss": 0.1314, | |
| "num_input_tokens_seen": 526656, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.21307506053268765, | |
| "grad_norm": 3.9568676948547363, | |
| "learning_rate": 2.9627779668411795e-05, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 552544, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.22276029055690072, | |
| "grad_norm": 2.4331846237182617, | |
| "learning_rate": 2.9593329599246766e-05, | |
| "loss": 0.115, | |
| "num_input_tokens_seen": 577472, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2324455205811138, | |
| "grad_norm": 2.525543212890625, | |
| "learning_rate": 2.955737691236108e-05, | |
| "loss": 0.1158, | |
| "num_input_tokens_seen": 601856, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.24213075060532688, | |
| "grad_norm": 2.2355105876922607, | |
| "learning_rate": 2.9519925309661422e-05, | |
| "loss": 0.111, | |
| "num_input_tokens_seen": 627904, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.25181598062953997, | |
| "grad_norm": 4.165389537811279, | |
| "learning_rate": 2.948097864739194e-05, | |
| "loss": 0.1314, | |
| "num_input_tokens_seen": 651936, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.26150121065375304, | |
| "grad_norm": 3.1712851524353027, | |
| "learning_rate": 2.944054093573719e-05, | |
| "loss": 0.143, | |
| "num_input_tokens_seen": 676416, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2711864406779661, | |
| "grad_norm": 2.881716728210449, | |
| "learning_rate": 2.93986163384092e-05, | |
| "loss": 0.1121, | |
| "num_input_tokens_seen": 700832, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.28087167070217917, | |
| "grad_norm": 3.060872793197632, | |
| "learning_rate": 2.9355209172218777e-05, | |
| "loss": 0.1159, | |
| "num_input_tokens_seen": 725824, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.29055690072639223, | |
| "grad_norm": 4.449444770812988, | |
| "learning_rate": 2.931032390663101e-05, | |
| "loss": 0.133, | |
| "num_input_tokens_seen": 749408, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.30024213075060535, | |
| "grad_norm": 5.323568344116211, | |
| "learning_rate": 2.926396516330506e-05, | |
| "loss": 0.1172, | |
| "num_input_tokens_seen": 773984, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3099273607748184, | |
| "grad_norm": 3.144500732421875, | |
| "learning_rate": 2.921613771561829e-05, | |
| "loss": 0.136, | |
| "num_input_tokens_seen": 799168, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3196125907990315, | |
| "grad_norm": 2.433586359024048, | |
| "learning_rate": 2.916684648817478e-05, | |
| "loss": 0.0973, | |
| "num_input_tokens_seen": 824320, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.32929782082324455, | |
| "grad_norm": 3.349472761154175, | |
| "learning_rate": 2.9116096556298256e-05, | |
| "loss": 0.13, | |
| "num_input_tokens_seen": 849632, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3389830508474576, | |
| "grad_norm": 1.8927061557769775, | |
| "learning_rate": 2.9063893145509475e-05, | |
| "loss": 0.1257, | |
| "num_input_tokens_seen": 874400, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3486682808716707, | |
| "grad_norm": 3.972686529159546, | |
| "learning_rate": 2.901024163098822e-05, | |
| "loss": 0.1155, | |
| "num_input_tokens_seen": 899264, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3583535108958838, | |
| "grad_norm": 1.177282452583313, | |
| "learning_rate": 2.8955147537019815e-05, | |
| "loss": 0.1251, | |
| "num_input_tokens_seen": 924544, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.36803874092009686, | |
| "grad_norm": 1.9911576509475708, | |
| "learning_rate": 2.88986165364263e-05, | |
| "loss": 0.1147, | |
| "num_input_tokens_seen": 949792, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.37772397094430993, | |
| "grad_norm": 2.402615785598755, | |
| "learning_rate": 2.8840654449982344e-05, | |
| "loss": 0.1433, | |
| "num_input_tokens_seen": 974112, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.387409200968523, | |
| "grad_norm": 1.3184998035430908, | |
| "learning_rate": 2.8781267245815898e-05, | |
| "loss": 0.1117, | |
| "num_input_tokens_seen": 999168, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.39709443099273606, | |
| "grad_norm": 1.9284625053405762, | |
| "learning_rate": 2.8720461038793672e-05, | |
| "loss": 0.1353, | |
| "num_input_tokens_seen": 1024320, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.4067796610169492, | |
| "grad_norm": 3.1020259857177734, | |
| "learning_rate": 2.8658242089891515e-05, | |
| "loss": 0.1165, | |
| "num_input_tokens_seen": 1049088, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.41646489104116224, | |
| "grad_norm": 2.203179359436035, | |
| "learning_rate": 2.8594616805549752e-05, | |
| "loss": 0.1215, | |
| "num_input_tokens_seen": 1073632, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.4261501210653753, | |
| "grad_norm": 2.053194522857666, | |
| "learning_rate": 2.8529591737013526e-05, | |
| "loss": 0.1066, | |
| "num_input_tokens_seen": 1098208, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.4358353510895884, | |
| "grad_norm": 2.780935049057007, | |
| "learning_rate": 2.8463173579658258e-05, | |
| "loss": 0.0879, | |
| "num_input_tokens_seen": 1122336, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.44552058111380144, | |
| "grad_norm": 1.9929611682891846, | |
| "learning_rate": 2.8395369172300235e-05, | |
| "loss": 0.1141, | |
| "num_input_tokens_seen": 1147392, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4552058111380145, | |
| "grad_norm": 1.1469779014587402, | |
| "learning_rate": 2.8326185496492464e-05, | |
| "loss": 0.1052, | |
| "num_input_tokens_seen": 1173248, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4648910411622276, | |
| "grad_norm": 2.501117706298828, | |
| "learning_rate": 2.825562967580579e-05, | |
| "loss": 0.1086, | |
| "num_input_tokens_seen": 1197984, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4745762711864407, | |
| "grad_norm": 2.0266308784484863, | |
| "learning_rate": 2.8183708975095406e-05, | |
| "loss": 0.1201, | |
| "num_input_tokens_seen": 1222720, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.48426150121065376, | |
| "grad_norm": 1.1120251417160034, | |
| "learning_rate": 2.8110430799752845e-05, | |
| "loss": 0.1319, | |
| "num_input_tokens_seen": 1247232, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4939467312348668, | |
| "grad_norm": 1.2014496326446533, | |
| "learning_rate": 2.8035802694943457e-05, | |
| "loss": 0.1071, | |
| "num_input_tokens_seen": 1273184, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5036319612590799, | |
| "grad_norm": 1.1245245933532715, | |
| "learning_rate": 2.7959832344829512e-05, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 1298688, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.513317191283293, | |
| "grad_norm": 2.031115770339966, | |
| "learning_rate": 2.7882527571779003e-05, | |
| "loss": 0.1196, | |
| "num_input_tokens_seen": 1324128, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5230024213075061, | |
| "grad_norm": 1.7691289186477661, | |
| "learning_rate": 2.78038963355602e-05, | |
| "loss": 0.1334, | |
| "num_input_tokens_seen": 1349120, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.5326876513317191, | |
| "grad_norm": 2.9496989250183105, | |
| "learning_rate": 2.7723946732522055e-05, | |
| "loss": 0.1109, | |
| "num_input_tokens_seen": 1374304, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5423728813559322, | |
| "grad_norm": 2.2881715297698975, | |
| "learning_rate": 2.764268699476058e-05, | |
| "loss": 0.1274, | |
| "num_input_tokens_seen": 1399136, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5520581113801453, | |
| "grad_norm": 1.9754095077514648, | |
| "learning_rate": 2.756012548927119e-05, | |
| "loss": 0.1397, | |
| "num_input_tokens_seen": 1424672, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5617433414043583, | |
| "grad_norm": 1.9883428812026978, | |
| "learning_rate": 2.7476270717087215e-05, | |
| "loss": 0.101, | |
| "num_input_tokens_seen": 1449024, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.9653130769729614, | |
| "learning_rate": 2.7391131312404556e-05, | |
| "loss": 0.0941, | |
| "num_input_tokens_seen": 1475264, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5811138014527845, | |
| "grad_norm": 4.576601028442383, | |
| "learning_rate": 2.7304716041692663e-05, | |
| "loss": 0.0865, | |
| "num_input_tokens_seen": 1500064, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5907990314769975, | |
| "grad_norm": 2.4046311378479004, | |
| "learning_rate": 2.7217033802791906e-05, | |
| "loss": 0.1596, | |
| "num_input_tokens_seen": 1524448, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6004842615012107, | |
| "grad_norm": 1.7785555124282837, | |
| "learning_rate": 2.7128093623997368e-05, | |
| "loss": 0.0891, | |
| "num_input_tokens_seen": 1549536, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6101694915254238, | |
| "grad_norm": 2.2736170291900635, | |
| "learning_rate": 2.7037904663129262e-05, | |
| "loss": 0.1085, | |
| "num_input_tokens_seen": 1573408, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6198547215496368, | |
| "grad_norm": 1.0862345695495605, | |
| "learning_rate": 2.6946476206589972e-05, | |
| "loss": 0.1023, | |
| "num_input_tokens_seen": 1597888, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6295399515738499, | |
| "grad_norm": 0.5358290672302246, | |
| "learning_rate": 2.6853817668407875e-05, | |
| "loss": 0.0669, | |
| "num_input_tokens_seen": 1623296, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.639225181598063, | |
| "grad_norm": 2.3138749599456787, | |
| "learning_rate": 2.6759938589268023e-05, | |
| "loss": 0.1017, | |
| "num_input_tokens_seen": 1649216, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.648910411622276, | |
| "grad_norm": 3.2054226398468018, | |
| "learning_rate": 2.6664848635529742e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 1673760, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.6585956416464891, | |
| "grad_norm": 1.8352829217910767, | |
| "learning_rate": 2.6568557598231385e-05, | |
| "loss": 0.1081, | |
| "num_input_tokens_seen": 1698592, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.6682808716707022, | |
| "grad_norm": 1.203284740447998, | |
| "learning_rate": 2.6471075392082125e-05, | |
| "loss": 0.1037, | |
| "num_input_tokens_seen": 1723296, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6779661016949152, | |
| "grad_norm": 1.635628581047058, | |
| "learning_rate": 2.6372412054441116e-05, | |
| "loss": 0.1216, | |
| "num_input_tokens_seen": 1748384, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6876513317191283, | |
| "grad_norm": 0.8993457555770874, | |
| "learning_rate": 2.6272577744283965e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 1773600, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6973365617433414, | |
| "grad_norm": 1.7306419610977173, | |
| "learning_rate": 2.617158274115673e-05, | |
| "loss": 0.1034, | |
| "num_input_tokens_seen": 1798656, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.7070217917675545, | |
| "grad_norm": 2.770066976547241, | |
| "learning_rate": 2.6069437444117432e-05, | |
| "loss": 0.0872, | |
| "num_input_tokens_seen": 1824544, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7167070217917676, | |
| "grad_norm": 2.3590221405029297, | |
| "learning_rate": 2.596615237066535e-05, | |
| "loss": 0.1063, | |
| "num_input_tokens_seen": 1848896, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.7263922518159807, | |
| "grad_norm": 1.0496519804000854, | |
| "learning_rate": 2.586173815565805e-05, | |
| "loss": 0.1104, | |
| "num_input_tokens_seen": 1873248, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.7360774818401937, | |
| "grad_norm": 1.513573408126831, | |
| "learning_rate": 2.575620555021634e-05, | |
| "loss": 0.1125, | |
| "num_input_tokens_seen": 1897184, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.7457627118644068, | |
| "grad_norm": 1.5545728206634521, | |
| "learning_rate": 2.564956542061732e-05, | |
| "loss": 0.0969, | |
| "num_input_tokens_seen": 1922368, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.7554479418886199, | |
| "grad_norm": 1.9260263442993164, | |
| "learning_rate": 2.5541828747175477e-05, | |
| "loss": 0.1142, | |
| "num_input_tokens_seen": 1947904, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.7651331719128329, | |
| "grad_norm": 2.396538734436035, | |
| "learning_rate": 2.543300662311211e-05, | |
| "loss": 0.0926, | |
| "num_input_tokens_seen": 1971872, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.774818401937046, | |
| "grad_norm": 1.7069965600967407, | |
| "learning_rate": 2.532311025341309e-05, | |
| "loss": 0.0802, | |
| "num_input_tokens_seen": 1996352, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.784503631961259, | |
| "grad_norm": 5.540910243988037, | |
| "learning_rate": 2.5212150953675133e-05, | |
| "loss": 0.1248, | |
| "num_input_tokens_seen": 2020480, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7941888619854721, | |
| "grad_norm": 1.7795952558517456, | |
| "learning_rate": 2.5100140148940688e-05, | |
| "loss": 0.0767, | |
| "num_input_tokens_seen": 2044448, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.8038740920096852, | |
| "grad_norm": 2.7387983798980713, | |
| "learning_rate": 2.498708937252153e-05, | |
| "loss": 0.1239, | |
| "num_input_tokens_seen": 2070400, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 2.1243462562561035, | |
| "learning_rate": 2.4873010264811222e-05, | |
| "loss": 0.108, | |
| "num_input_tokens_seen": 2095392, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.8232445520581114, | |
| "grad_norm": 0.9928631782531738, | |
| "learning_rate": 2.4757914572086555e-05, | |
| "loss": 0.0994, | |
| "num_input_tokens_seen": 2120192, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.8329297820823245, | |
| "grad_norm": 6.047460556030273, | |
| "learning_rate": 2.464181414529809e-05, | |
| "loss": 0.0927, | |
| "num_input_tokens_seen": 2144384, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.8426150121065376, | |
| "grad_norm": 2.2197115421295166, | |
| "learning_rate": 2.4524720938849883e-05, | |
| "loss": 0.1328, | |
| "num_input_tokens_seen": 2168704, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.8523002421307506, | |
| "grad_norm": 2.0752601623535156, | |
| "learning_rate": 2.440664700936861e-05, | |
| "loss": 0.1229, | |
| "num_input_tokens_seen": 2193248, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.8619854721549637, | |
| "grad_norm": 1.00425386428833, | |
| "learning_rate": 2.4287604514462152e-05, | |
| "loss": 0.0957, | |
| "num_input_tokens_seen": 2217568, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.8716707021791767, | |
| "grad_norm": 1.9153094291687012, | |
| "learning_rate": 2.416760571146774e-05, | |
| "loss": 0.0975, | |
| "num_input_tokens_seen": 2242048, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8813559322033898, | |
| "grad_norm": 2.3558013439178467, | |
| "learning_rate": 2.4046662956189898e-05, | |
| "loss": 0.1068, | |
| "num_input_tokens_seen": 2266112, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.8910411622276029, | |
| "grad_norm": 2.546351909637451, | |
| "learning_rate": 2.3924788701628197e-05, | |
| "loss": 0.0688, | |
| "num_input_tokens_seen": 2290720, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.9007263922518159, | |
| "grad_norm": 1.2526168823242188, | |
| "learning_rate": 2.3801995496695028e-05, | |
| "loss": 0.1141, | |
| "num_input_tokens_seen": 2315488, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.910411622276029, | |
| "grad_norm": 2.134089231491089, | |
| "learning_rate": 2.367829598492348e-05, | |
| "loss": 0.1328, | |
| "num_input_tokens_seen": 2340992, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.9200968523002422, | |
| "grad_norm": 1.332915186882019, | |
| "learning_rate": 2.3553702903165502e-05, | |
| "loss": 0.1, | |
| "num_input_tokens_seen": 2366880, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.9297820823244553, | |
| "grad_norm": 1.5140970945358276, | |
| "learning_rate": 2.3428229080280407e-05, | |
| "loss": 0.1089, | |
| "num_input_tokens_seen": 2392000, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.9394673123486683, | |
| "grad_norm": 1.531954288482666, | |
| "learning_rate": 2.330188743581398e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 2417472, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.9491525423728814, | |
| "grad_norm": 1.3347736597061157, | |
| "learning_rate": 2.3174690978668155e-05, | |
| "loss": 0.1205, | |
| "num_input_tokens_seen": 2442496, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.9588377723970944, | |
| "grad_norm": 3.1497702598571777, | |
| "learning_rate": 2.3046652805761588e-05, | |
| "loss": 0.1004, | |
| "num_input_tokens_seen": 2467392, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.9685230024213075, | |
| "grad_norm": 1.6756023168563843, | |
| "learning_rate": 2.2917786100681078e-05, | |
| "loss": 0.1007, | |
| "num_input_tokens_seen": 2492768, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9782082324455206, | |
| "grad_norm": 2.56594181060791, | |
| "learning_rate": 2.2788104132324125e-05, | |
| "loss": 0.1179, | |
| "num_input_tokens_seen": 2518176, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.9878934624697336, | |
| "grad_norm": 2.1090595722198486, | |
| "learning_rate": 2.2657620253532685e-05, | |
| "loss": 0.0971, | |
| "num_input_tokens_seen": 2543296, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.9975786924939467, | |
| "grad_norm": 0.41959595680236816, | |
| "learning_rate": 2.252634789971827e-05, | |
| "loss": 0.0932, | |
| "num_input_tokens_seen": 2567680, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.006779661016949, | |
| "grad_norm": 1.6389803886413574, | |
| "learning_rate": 2.2394300587478566e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 2591016, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.0164648910411622, | |
| "grad_norm": 1.4045557975769043, | |
| "learning_rate": 2.2261491913205684e-05, | |
| "loss": 0.0985, | |
| "num_input_tokens_seen": 2615752, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.0261501210653754, | |
| "grad_norm": 2.0734925270080566, | |
| "learning_rate": 2.212793555168617e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 2640200, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.0358353510895884, | |
| "grad_norm": 2.1590147018432617, | |
| "learning_rate": 2.1993645254692994e-05, | |
| "loss": 0.116, | |
| "num_input_tokens_seen": 2665416, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.0455205811138015, | |
| "grad_norm": 1.739646553993225, | |
| "learning_rate": 2.1858634849569578e-05, | |
| "loss": 0.0972, | |
| "num_input_tokens_seen": 2690376, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.0552058111380145, | |
| "grad_norm": 0.6458954215049744, | |
| "learning_rate": 2.1722918237806042e-05, | |
| "loss": 0.0884, | |
| "num_input_tokens_seen": 2715080, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.0648910411622277, | |
| "grad_norm": 2.2830138206481934, | |
| "learning_rate": 2.158650939360782e-05, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 2740424, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.0745762711864406, | |
| "grad_norm": 1.5225194692611694, | |
| "learning_rate": 2.1449422362456794e-05, | |
| "loss": 0.0813, | |
| "num_input_tokens_seen": 2765640, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.0842615012106538, | |
| "grad_norm": 1.683604121208191, | |
| "learning_rate": 2.13116712596651e-05, | |
| "loss": 0.0953, | |
| "num_input_tokens_seen": 2791176, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.0939467312348667, | |
| "grad_norm": 1.5679166316986084, | |
| "learning_rate": 2.1173270268921703e-05, | |
| "loss": 0.0933, | |
| "num_input_tokens_seen": 2816072, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.10363196125908, | |
| "grad_norm": 1.3097947835922241, | |
| "learning_rate": 2.1034233640831988e-05, | |
| "loss": 0.0819, | |
| "num_input_tokens_seen": 2840776, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.113317191283293, | |
| "grad_norm": 0.5728388428688049, | |
| "learning_rate": 2.0894575691450396e-05, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 2865416, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.123002421307506, | |
| "grad_norm": 2.3043558597564697, | |
| "learning_rate": 2.0754310800806395e-05, | |
| "loss": 0.0748, | |
| "num_input_tokens_seen": 2890248, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.1326876513317192, | |
| "grad_norm": 1.2087112665176392, | |
| "learning_rate": 2.0613453411423797e-05, | |
| "loss": 0.0959, | |
| "num_input_tokens_seen": 2916392, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.1423728813559322, | |
| "grad_norm": 1.5639240741729736, | |
| "learning_rate": 2.0472018026833684e-05, | |
| "loss": 0.0709, | |
| "num_input_tokens_seen": 2941160, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.1520581113801454, | |
| "grad_norm": 0.5889459848403931, | |
| "learning_rate": 2.0330019210081022e-05, | |
| "loss": 0.0731, | |
| "num_input_tokens_seen": 2966120, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.1617433414043583, | |
| "grad_norm": 1.854230523109436, | |
| "learning_rate": 2.0187471582225173e-05, | |
| "loss": 0.1005, | |
| "num_input_tokens_seen": 2990088, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.1714285714285715, | |
| "grad_norm": 2.01247239112854, | |
| "learning_rate": 2.004438982083442e-05, | |
| "loss": 0.0579, | |
| "num_input_tokens_seen": 3015400, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.1811138014527844, | |
| "grad_norm": 2.292900323867798, | |
| "learning_rate": 1.9900788658474677e-05, | |
| "loss": 0.0792, | |
| "num_input_tokens_seen": 3039464, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.1907990314769976, | |
| "grad_norm": 1.4194159507751465, | |
| "learning_rate": 1.975668288119252e-05, | |
| "loss": 0.057, | |
| "num_input_tokens_seen": 3063816, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.2004842615012106, | |
| "grad_norm": 1.0512489080429077, | |
| "learning_rate": 1.961208732699275e-05, | |
| "loss": 0.102, | |
| "num_input_tokens_seen": 3088968, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.2101694915254237, | |
| "grad_norm": 0.9465106129646301, | |
| "learning_rate": 1.9467016884310565e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 3113736, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.2198547215496367, | |
| "grad_norm": 1.274294376373291, | |
| "learning_rate": 1.9321486490478565e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 3138344, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.2295399515738499, | |
| "grad_norm": 1.9390579462051392, | |
| "learning_rate": 1.91755111301887e-05, | |
| "loss": 0.0711, | |
| "num_input_tokens_seen": 3163496, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.239225181598063, | |
| "grad_norm": 1.2855744361877441, | |
| "learning_rate": 1.902910583394938e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 3188392, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.248910411622276, | |
| "grad_norm": 2.931248188018799, | |
| "learning_rate": 1.888228567653781e-05, | |
| "loss": 0.0448, | |
| "num_input_tokens_seen": 3213224, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.2585956416464892, | |
| "grad_norm": 1.9991300106048584, | |
| "learning_rate": 1.873506577544784e-05, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 3238568, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.2682808716707021, | |
| "grad_norm": 1.3530927896499634, | |
| "learning_rate": 1.8587461289333327e-05, | |
| "loss": 0.1043, | |
| "num_input_tokens_seen": 3264264, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.2779661016949153, | |
| "grad_norm": 2.07991099357605, | |
| "learning_rate": 1.8439487416447353e-05, | |
| "loss": 0.1037, | |
| "num_input_tokens_seen": 3288840, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.2876513317191283, | |
| "grad_norm": 1.8533947467803955, | |
| "learning_rate": 1.8291159393077294e-05, | |
| "loss": 0.0928, | |
| "num_input_tokens_seen": 3313832, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.2973365617433414, | |
| "grad_norm": 1.118119716644287, | |
| "learning_rate": 1.814249249197602e-05, | |
| "loss": 0.0775, | |
| "num_input_tokens_seen": 3337736, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.3070217917675544, | |
| "grad_norm": 2.740079641342163, | |
| "learning_rate": 1.7993502020789294e-05, | |
| "loss": 0.0521, | |
| "num_input_tokens_seen": 3362024, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.3167070217917676, | |
| "grad_norm": 1.9268351793289185, | |
| "learning_rate": 1.7844203320479614e-05, | |
| "loss": 0.0687, | |
| "num_input_tokens_seen": 3387496, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.3263922518159807, | |
| "grad_norm": 2.3576388359069824, | |
| "learning_rate": 1.7694611763746632e-05, | |
| "loss": 0.0704, | |
| "num_input_tokens_seen": 3412072, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.3360774818401937, | |
| "grad_norm": 1.127432942390442, | |
| "learning_rate": 1.754474275344427e-05, | |
| "loss": 0.0826, | |
| "num_input_tokens_seen": 3437096, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.3457627118644067, | |
| "grad_norm": 4.377537250518799, | |
| "learning_rate": 1.7394611720994747e-05, | |
| "loss": 0.0445, | |
| "num_input_tokens_seen": 3462120, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.3554479418886198, | |
| "grad_norm": 2.1285200119018555, | |
| "learning_rate": 1.724423412479967e-05, | |
| "loss": 0.0951, | |
| "num_input_tokens_seen": 3486952, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.365133171912833, | |
| "grad_norm": 0.16216270625591278, | |
| "learning_rate": 1.7093625448648348e-05, | |
| "loss": 0.0539, | |
| "num_input_tokens_seen": 3512264, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.374818401937046, | |
| "grad_norm": 2.1299915313720703, | |
| "learning_rate": 1.694280120012349e-05, | |
| "loss": 0.0848, | |
| "num_input_tokens_seen": 3537192, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.3845036319612591, | |
| "grad_norm": 2.476757049560547, | |
| "learning_rate": 1.6791776909004434e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 3560872, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.394188861985472, | |
| "grad_norm": 0.4373377561569214, | |
| "learning_rate": 1.664056812566812e-05, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 3586216, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.4038740920096853, | |
| "grad_norm": 1.9471170902252197, | |
| "learning_rate": 1.648919041948792e-05, | |
| "loss": 0.0798, | |
| "num_input_tokens_seen": 3610792, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.4135593220338982, | |
| "grad_norm": 2.911750316619873, | |
| "learning_rate": 1.6337659377230544e-05, | |
| "loss": 0.0897, | |
| "num_input_tokens_seen": 3634760, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.4232445520581114, | |
| "grad_norm": 2.9474802017211914, | |
| "learning_rate": 1.61859906014511e-05, | |
| "loss": 0.0858, | |
| "num_input_tokens_seen": 3659560, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.4329297820823244, | |
| "grad_norm": 0.6501768827438354, | |
| "learning_rate": 1.6034199708886573e-05, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 3684840, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.4426150121065375, | |
| "grad_norm": 1.6708017587661743, | |
| "learning_rate": 1.5882302328847847e-05, | |
| "loss": 0.0842, | |
| "num_input_tokens_seen": 3709096, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.4523002421307507, | |
| "grad_norm": 1.5014967918395996, | |
| "learning_rate": 1.5730314101610376e-05, | |
| "loss": 0.0367, | |
| "num_input_tokens_seen": 3734728, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4619854721549637, | |
| "grad_norm": 3.2587804794311523, | |
| "learning_rate": 1.5578250676803824e-05, | |
| "loss": 0.1085, | |
| "num_input_tokens_seen": 3758984, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.4716707021791768, | |
| "grad_norm": 6.304242134094238, | |
| "learning_rate": 1.5426127711800636e-05, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 3784296, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.4813559322033898, | |
| "grad_norm": 1.1681016683578491, | |
| "learning_rate": 1.5273960870103872e-05, | |
| "loss": 0.0705, | |
| "num_input_tokens_seen": 3809768, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.491041162227603, | |
| "grad_norm": 1.111617922782898, | |
| "learning_rate": 1.5121765819734418e-05, | |
| "loss": 0.071, | |
| "num_input_tokens_seen": 3834536, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.5007263922518161, | |
| "grad_norm": 1.7780523300170898, | |
| "learning_rate": 1.4969558231617681e-05, | |
| "loss": 0.0648, | |
| "num_input_tokens_seen": 3858792, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.510411622276029, | |
| "grad_norm": 2.2017934322357178, | |
| "learning_rate": 1.4817353777970038e-05, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 3883976, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.520096852300242, | |
| "grad_norm": 1.8567978143692017, | |
| "learning_rate": 1.466516813068512e-05, | |
| "loss": 0.0726, | |
| "num_input_tokens_seen": 3908392, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.5297820823244552, | |
| "grad_norm": 2.567291021347046, | |
| "learning_rate": 1.451301695972015e-05, | |
| "loss": 0.0882, | |
| "num_input_tokens_seen": 3932552, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.5394673123486684, | |
| "grad_norm": 1.9968935251235962, | |
| "learning_rate": 1.436091593148244e-05, | |
| "loss": 0.1149, | |
| "num_input_tokens_seen": 3957672, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.5491525423728814, | |
| "grad_norm": 1.9058917760849, | |
| "learning_rate": 1.4208880707216323e-05, | |
| "loss": 0.0841, | |
| "num_input_tokens_seen": 3982824, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5588377723970943, | |
| "grad_norm": 1.9218000173568726, | |
| "learning_rate": 1.405692694139054e-05, | |
| "loss": 0.0896, | |
| "num_input_tokens_seen": 4008072, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.5685230024213075, | |
| "grad_norm": 1.5786553621292114, | |
| "learning_rate": 1.3905070280086387e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 4033096, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.5782082324455207, | |
| "grad_norm": 2.503990888595581, | |
| "learning_rate": 1.3753326359386695e-05, | |
| "loss": 0.077, | |
| "num_input_tokens_seen": 4058120, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.5878934624697336, | |
| "grad_norm": 1.5616143941879272, | |
| "learning_rate": 1.3601710803765814e-05, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 4082792, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.5975786924939466, | |
| "grad_norm": 1.2533211708068848, | |
| "learning_rate": 1.3450239224480884e-05, | |
| "loss": 0.0605, | |
| "num_input_tokens_seen": 4107336, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.6072639225181597, | |
| "grad_norm": 1.1046490669250488, | |
| "learning_rate": 1.329892721796433e-05, | |
| "loss": 0.0985, | |
| "num_input_tokens_seen": 4132456, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.616949152542373, | |
| "grad_norm": 1.143494725227356, | |
| "learning_rate": 1.314779036421802e-05, | |
| "loss": 0.0547, | |
| "num_input_tokens_seen": 4156584, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.626634382566586, | |
| "grad_norm": 2.6082706451416016, | |
| "learning_rate": 1.2996844225209033e-05, | |
| "loss": 0.0919, | |
| "num_input_tokens_seen": 4181448, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.636319612590799, | |
| "grad_norm": 2.4191458225250244, | |
| "learning_rate": 1.2846104343267283e-05, | |
| "loss": 0.1204, | |
| "num_input_tokens_seen": 4207560, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.646004842615012, | |
| "grad_norm": 2.051799774169922, | |
| "learning_rate": 1.2695586239485223e-05, | |
| "loss": 0.0664, | |
| "num_input_tokens_seen": 4232040, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6556900726392252, | |
| "grad_norm": 1.525844931602478, | |
| "learning_rate": 1.254530541211968e-05, | |
| "loss": 0.0805, | |
| "num_input_tokens_seen": 4257576, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.6653753026634384, | |
| "grad_norm": 0.9474373459815979, | |
| "learning_rate": 1.2395277334996045e-05, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 4282472, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.6750605326876513, | |
| "grad_norm": 1.8932424783706665, | |
| "learning_rate": 1.2245517455915036e-05, | |
| "loss": 0.0734, | |
| "num_input_tokens_seen": 4306792, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.6847457627118643, | |
| "grad_norm": 1.9888746738433838, | |
| "learning_rate": 1.2096041195062051e-05, | |
| "loss": 0.0831, | |
| "num_input_tokens_seen": 4333384, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.6944309927360774, | |
| "grad_norm": 1.8355742692947388, | |
| "learning_rate": 1.1946863943419452e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 4358344, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.7041162227602906, | |
| "grad_norm": 2.8447251319885254, | |
| "learning_rate": 1.1798001061181799e-05, | |
| "loss": 0.0988, | |
| "num_input_tokens_seen": 4381768, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.7138014527845038, | |
| "grad_norm": 2.670257806777954, | |
| "learning_rate": 1.1649467876174252e-05, | |
| "loss": 0.0936, | |
| "num_input_tokens_seen": 4405192, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.7234866828087168, | |
| "grad_norm": 1.188839077949524, | |
| "learning_rate": 1.1501279682274368e-05, | |
| "loss": 0.0901, | |
| "num_input_tokens_seen": 4430344, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.7331719128329297, | |
| "grad_norm": 2.494746685028076, | |
| "learning_rate": 1.1353451737837312e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 4455336, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.7428571428571429, | |
| "grad_norm": 1.3223942518234253, | |
| "learning_rate": 1.1205999264124788e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 4480648, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.752542372881356, | |
| "grad_norm": 1.3812003135681152, | |
| "learning_rate": 1.105893744373776e-05, | |
| "loss": 0.0788, | |
| "num_input_tokens_seen": 4506600, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.762227602905569, | |
| "grad_norm": 0.7805346250534058, | |
| "learning_rate": 1.0912281419053139e-05, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 4531368, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.771912832929782, | |
| "grad_norm": 1.105878472328186, | |
| "learning_rate": 1.0766046290664662e-05, | |
| "loss": 0.0779, | |
| "num_input_tokens_seen": 4555272, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.7815980629539951, | |
| "grad_norm": 1.8672295808792114, | |
| "learning_rate": 1.0620247115828044e-05, | |
| "loss": 0.0838, | |
| "num_input_tokens_seen": 4580328, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.7912832929782083, | |
| "grad_norm": 1.844306468963623, | |
| "learning_rate": 1.047489890691055e-05, | |
| "loss": 0.0594, | |
| "num_input_tokens_seen": 4605768, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.8009685230024213, | |
| "grad_norm": 1.2717005014419556, | |
| "learning_rate": 1.0330016629845276e-05, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 4631048, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.8106537530266342, | |
| "grad_norm": 3.5843582153320312, | |
| "learning_rate": 1.0185615202590144e-05, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 4656456, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.8203389830508474, | |
| "grad_norm": 4.254288673400879, | |
| "learning_rate": 1.004170949359187e-05, | |
| "loss": 0.0654, | |
| "num_input_tokens_seen": 4681384, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.8300242130750606, | |
| "grad_norm": 1.351646065711975, | |
| "learning_rate": 9.89831432025501e-06, | |
| "loss": 0.0712, | |
| "num_input_tokens_seen": 4706216, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.8397094430992738, | |
| "grad_norm": 1.9015384912490845, | |
| "learning_rate": 9.755444447416255e-06, | |
| "loss": 0.0829, | |
| "num_input_tokens_seen": 4730984, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.8493946731234867, | |
| "grad_norm": 1.3803085088729858, | |
| "learning_rate": 9.613114585824196e-06, | |
| "loss": 0.0532, | |
| "num_input_tokens_seen": 4755112, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.8590799031476997, | |
| "grad_norm": 6.487275123596191, | |
| "learning_rate": 9.471339390624574e-06, | |
| "loss": 0.0781, | |
| "num_input_tokens_seen": 4780232, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.8687651331719128, | |
| "grad_norm": 2.182865619659424, | |
| "learning_rate": 9.330133459851323e-06, | |
| "loss": 0.0908, | |
| "num_input_tokens_seen": 4805192, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.878450363196126, | |
| "grad_norm": 0.42010384798049927, | |
| "learning_rate": 9.189511332923463e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 4830856, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.888135593220339, | |
| "grad_norm": 1.609157919883728, | |
| "learning_rate": 9.049487489148008e-06, | |
| "loss": 0.0912, | |
| "num_input_tokens_seen": 4855656, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.897820823244552, | |
| "grad_norm": 2.4291250705718994, | |
| "learning_rate": 8.910076346229134e-06, | |
| "loss": 0.0746, | |
| "num_input_tokens_seen": 4880392, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.907506053268765, | |
| "grad_norm": 2.243717670440674, | |
| "learning_rate": 8.77129225878361e-06, | |
| "loss": 0.1066, | |
| "num_input_tokens_seen": 4905320, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.9171912832929783, | |
| "grad_norm": 2.145559072494507, | |
| "learning_rate": 8.633149516862777e-06, | |
| "loss": 0.0839, | |
| "num_input_tokens_seen": 4930536, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.9268765133171912, | |
| "grad_norm": 0.6746326088905334, | |
| "learning_rate": 8.495662344481135e-06, | |
| "loss": 0.0527, | |
| "num_input_tokens_seen": 4956168, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.9365617433414044, | |
| "grad_norm": 1.293521761894226, | |
| "learning_rate": 8.358844898151791e-06, | |
| "loss": 0.1033, | |
| "num_input_tokens_seen": 4980584, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.9462469733656174, | |
| "grad_norm": 1.7922570705413818, | |
| "learning_rate": 8.222711265428779e-06, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 5005992, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.9559322033898305, | |
| "grad_norm": 1.0770626068115234, | |
| "learning_rate": 8.087275463456548e-06, | |
| "loss": 0.0652, | |
| "num_input_tokens_seen": 5032168, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.9656174334140437, | |
| "grad_norm": 0.7968271374702454, | |
| "learning_rate": 7.952551437526648e-06, | |
| "loss": 0.0593, | |
| "num_input_tokens_seen": 5056296, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.9753026634382567, | |
| "grad_norm": 2.140667676925659, | |
| "learning_rate": 7.818553059641868e-06, | |
| "loss": 0.0933, | |
| "num_input_tokens_seen": 5080424, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.9849878934624696, | |
| "grad_norm": 2.905066967010498, | |
| "learning_rate": 7.685294127087852e-06, | |
| "loss": 0.059, | |
| "num_input_tokens_seen": 5104904, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.9946731234866828, | |
| "grad_norm": 2.5095653533935547, | |
| "learning_rate": 7.552788361012486e-06, | |
| "loss": 0.0766, | |
| "num_input_tokens_seen": 5129064, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.0038740920096854, | |
| "grad_norm": 1.0241445302963257, | |
| "learning_rate": 7.421049405013061e-06, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 5152120, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.013559322033898, | |
| "grad_norm": 1.7620762586593628, | |
| "learning_rate": 7.290090823731452e-06, | |
| "loss": 0.0419, | |
| "num_input_tokens_seen": 5176728, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.0232445520581113, | |
| "grad_norm": 1.1471503973007202, | |
| "learning_rate": 7.159926101457423e-06, | |
| "loss": 0.0586, | |
| "num_input_tokens_seen": 5201176, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.0329297820823244, | |
| "grad_norm": 1.4868978261947632, | |
| "learning_rate": 7.030568640740202e-06, | |
| "loss": 0.0382, | |
| "num_input_tokens_seen": 5225368, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.0426150121065376, | |
| "grad_norm": 0.8362380266189575, | |
| "learning_rate": 6.902031761008456e-06, | |
| "loss": 0.0597, | |
| "num_input_tokens_seen": 5250136, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.052300242130751, | |
| "grad_norm": 2.6067404747009277, | |
| "learning_rate": 6.774328697198879e-06, | |
| "loss": 0.0367, | |
| "num_input_tokens_seen": 5274264, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.0619854721549635, | |
| "grad_norm": 1.6327483654022217, | |
| "learning_rate": 6.647472598393399e-06, | |
| "loss": 0.04, | |
| "num_input_tokens_seen": 5298264, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.0716707021791767, | |
| "grad_norm": 1.461899995803833, | |
| "learning_rate": 6.521476526465309e-06, | |
| "loss": 0.0426, | |
| "num_input_tokens_seen": 5322872, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.08135593220339, | |
| "grad_norm": 2.3133087158203125, | |
| "learning_rate": 6.3963534547343126e-06, | |
| "loss": 0.0706, | |
| "num_input_tokens_seen": 5348120, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.091041162227603, | |
| "grad_norm": 3.1375937461853027, | |
| "learning_rate": 6.27211626663071e-06, | |
| "loss": 0.0377, | |
| "num_input_tokens_seen": 5373240, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.100726392251816, | |
| "grad_norm": 2.147362470626831, | |
| "learning_rate": 6.148777754368862e-06, | |
| "loss": 0.0608, | |
| "num_input_tokens_seen": 5398296, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.110411622276029, | |
| "grad_norm": 0.6415455341339111, | |
| "learning_rate": 6.026350617630011e-06, | |
| "loss": 0.0334, | |
| "num_input_tokens_seen": 5424408, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.120096852300242, | |
| "grad_norm": 3.5363268852233887, | |
| "learning_rate": 5.904847462254646e-06, | |
| "loss": 0.0445, | |
| "num_input_tokens_seen": 5449880, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.1297820823244553, | |
| "grad_norm": 2.8637278079986572, | |
| "learning_rate": 5.784280798944537e-06, | |
| "loss": 0.0735, | |
| "num_input_tokens_seen": 5474808, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.1394673123486685, | |
| "grad_norm": 1.1030181646347046, | |
| "learning_rate": 5.6646630419745404e-06, | |
| "loss": 0.056, | |
| "num_input_tokens_seen": 5499672, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.1491525423728812, | |
| "grad_norm": 1.6034140586853027, | |
| "learning_rate": 5.5460065079143694e-06, | |
| "loss": 0.0703, | |
| "num_input_tokens_seen": 5523672, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.1588377723970944, | |
| "grad_norm": 4.010861396789551, | |
| "learning_rate": 5.428323414360401e-06, | |
| "loss": 0.0504, | |
| "num_input_tokens_seen": 5548664, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.1685230024213076, | |
| "grad_norm": 2.1378917694091797, | |
| "learning_rate": 5.311625878677658e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 5573944, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.1782082324455208, | |
| "grad_norm": 1.6304939985275269, | |
| "learning_rate": 5.195925916752166e-06, | |
| "loss": 0.045, | |
| "num_input_tokens_seen": 5599224, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.1878934624697335, | |
| "grad_norm": 1.6586905717849731, | |
| "learning_rate": 5.081235441753685e-06, | |
| "loss": 0.0483, | |
| "num_input_tokens_seen": 5623864, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.1975786924939467, | |
| "grad_norm": 2.3342106342315674, | |
| "learning_rate": 4.9675662629091055e-06, | |
| "loss": 0.0476, | |
| "num_input_tokens_seen": 5648760, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.20726392251816, | |
| "grad_norm": 1.122441291809082, | |
| "learning_rate": 4.854930084286458e-06, | |
| "loss": 0.0537, | |
| "num_input_tokens_seen": 5673720, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.216949152542373, | |
| "grad_norm": 0.22967131435871124, | |
| "learning_rate": 4.743338503589796e-06, | |
| "loss": 0.0567, | |
| "num_input_tokens_seen": 5697784, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.226634382566586, | |
| "grad_norm": 3.79902720451355, | |
| "learning_rate": 4.632803010965056e-06, | |
| "loss": 0.0502, | |
| "num_input_tokens_seen": 5722040, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.236319612590799, | |
| "grad_norm": 0.5887905359268188, | |
| "learning_rate": 4.523334987816917e-06, | |
| "loss": 0.0444, | |
| "num_input_tokens_seen": 5747672, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.246004842615012, | |
| "grad_norm": 1.776781678199768, | |
| "learning_rate": 4.414945705636949e-06, | |
| "loss": 0.0482, | |
| "num_input_tokens_seen": 5772056, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.2556900726392253, | |
| "grad_norm": 2.457751512527466, | |
| "learning_rate": 4.307646324843004e-06, | |
| "loss": 0.0398, | |
| "num_input_tokens_seen": 5796728, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.2653753026634385, | |
| "grad_norm": 1.8455132246017456, | |
| "learning_rate": 4.201447893630065e-06, | |
| "loss": 0.0268, | |
| "num_input_tokens_seen": 5822520, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.275060532687651, | |
| "grad_norm": 3.7571520805358887, | |
| "learning_rate": 4.096361346832681e-06, | |
| "loss": 0.0427, | |
| "num_input_tokens_seen": 5847768, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.2847457627118644, | |
| "grad_norm": 4.052141189575195, | |
| "learning_rate": 3.992397504799039e-06, | |
| "loss": 0.0363, | |
| "num_input_tokens_seen": 5873208, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.2944309927360775, | |
| "grad_norm": 2.814667224884033, | |
| "learning_rate": 3.889567072276827e-06, | |
| "loss": 0.0432, | |
| "num_input_tokens_seen": 5897368, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.3041162227602907, | |
| "grad_norm": 0.680135190486908, | |
| "learning_rate": 3.78788063731103e-06, | |
| "loss": 0.0662, | |
| "num_input_tokens_seen": 5921656, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.3138014527845034, | |
| "grad_norm": 4.201208591461182, | |
| "learning_rate": 3.6873486701536814e-06, | |
| "loss": 0.0434, | |
| "num_input_tokens_seen": 5946328, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.3234866828087166, | |
| "grad_norm": 1.828552007675171, | |
| "learning_rate": 3.587981522185829e-06, | |
| "loss": 0.0425, | |
| "num_input_tokens_seen": 5971352, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.33317191283293, | |
| "grad_norm": 0.6704538464546204, | |
| "learning_rate": 3.4897894248516736e-06, | |
| "loss": 0.0533, | |
| "num_input_tokens_seen": 5995544, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.342857142857143, | |
| "grad_norm": 2.377774238586426, | |
| "learning_rate": 3.3927824886050555e-06, | |
| "loss": 0.0499, | |
| "num_input_tokens_seen": 6020600, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.3525423728813557, | |
| "grad_norm": 0.2766050398349762, | |
| "learning_rate": 3.2969707018684657e-06, | |
| "loss": 0.021, | |
| "num_input_tokens_seen": 6045304, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.362227602905569, | |
| "grad_norm": 1.9754971265792847, | |
| "learning_rate": 3.202363930004536e-06, | |
| "loss": 0.0216, | |
| "num_input_tokens_seen": 6070776, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.371912832929782, | |
| "grad_norm": 6.165454387664795, | |
| "learning_rate": 3.1089719143002615e-06, | |
| "loss": 0.0431, | |
| "num_input_tokens_seen": 6095256, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.3815980629539952, | |
| "grad_norm": 2.579355001449585, | |
| "learning_rate": 3.016804270963994e-06, | |
| "loss": 0.0515, | |
| "num_input_tokens_seen": 6120088, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.3912832929782084, | |
| "grad_norm": 1.1952487230300903, | |
| "learning_rate": 2.925870490135255e-06, | |
| "loss": 0.0349, | |
| "num_input_tokens_seen": 6144792, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.400968523002421, | |
| "grad_norm": 0.08051615208387375, | |
| "learning_rate": 2.8361799349076143e-06, | |
| "loss": 0.0251, | |
| "num_input_tokens_seen": 6169688, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.4106537530266343, | |
| "grad_norm": 3.1085357666015625, | |
| "learning_rate": 2.747741840364593e-06, | |
| "loss": 0.0634, | |
| "num_input_tokens_seen": 6194680, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.4203389830508475, | |
| "grad_norm": 1.2273328304290771, | |
| "learning_rate": 2.6605653126287555e-06, | |
| "loss": 0.0451, | |
| "num_input_tokens_seen": 6218712, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.4300242130750607, | |
| "grad_norm": 2.9415712356567383, | |
| "learning_rate": 2.5746593279241105e-06, | |
| "loss": 0.0395, | |
| "num_input_tokens_seen": 6243384, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.4397094430992734, | |
| "grad_norm": 0.24813522398471832, | |
| "learning_rate": 2.490032731651833e-06, | |
| "loss": 0.0537, | |
| "num_input_tokens_seen": 6267416, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.4493946731234866, | |
| "grad_norm": 1.5883897542953491, | |
| "learning_rate": 2.4066942374795205e-06, | |
| "loss": 0.0402, | |
| "num_input_tokens_seen": 6292696, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.4590799031476998, | |
| "grad_norm": 0.41333088278770447, | |
| "learning_rate": 2.324652426443962e-06, | |
| "loss": 0.0295, | |
| "num_input_tokens_seen": 6317208, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.468765133171913, | |
| "grad_norm": 3.1688761711120605, | |
| "learning_rate": 2.243915746067587e-06, | |
| "loss": 0.0515, | |
| "num_input_tokens_seen": 6341688, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.478450363196126, | |
| "grad_norm": 0.7070954442024231, | |
| "learning_rate": 2.164492509488657e-06, | |
| "loss": 0.0443, | |
| "num_input_tokens_seen": 6366712, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.488135593220339, | |
| "grad_norm": 0.3987884819507599, | |
| "learning_rate": 2.086390894605288e-06, | |
| "loss": 0.0555, | |
| "num_input_tokens_seen": 6391256, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.497820823244552, | |
| "grad_norm": 1.7903181314468384, | |
| "learning_rate": 2.0096189432334194e-06, | |
| "loss": 0.054, | |
| "num_input_tokens_seen": 6416184, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.507506053268765, | |
| "grad_norm": 7.973659992218018, | |
| "learning_rate": 1.9341845602787733e-06, | |
| "loss": 0.075, | |
| "num_input_tokens_seen": 6441176, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.5171912832929784, | |
| "grad_norm": 2.1646482944488525, | |
| "learning_rate": 1.8600955129229009e-06, | |
| "loss": 0.0384, | |
| "num_input_tokens_seen": 6465688, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.526876513317191, | |
| "grad_norm": 0.9478936791419983, | |
| "learning_rate": 1.7873594298234557e-06, | |
| "loss": 0.038, | |
| "num_input_tokens_seen": 6490456, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.5365617433414043, | |
| "grad_norm": 0.5018621683120728, | |
| "learning_rate": 1.7159838003286848e-06, | |
| "loss": 0.0233, | |
| "num_input_tokens_seen": 6515704, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.5462469733656174, | |
| "grad_norm": 4.254843711853027, | |
| "learning_rate": 1.645975973706269e-06, | |
| "loss": 0.0634, | |
| "num_input_tokens_seen": 6540920, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.5559322033898306, | |
| "grad_norm": 0.3339782655239105, | |
| "learning_rate": 1.5773431583866227e-06, | |
| "loss": 0.0333, | |
| "num_input_tokens_seen": 6565880, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.565617433414044, | |
| "grad_norm": 2.9373421669006348, | |
| "learning_rate": 1.5100924212206534e-06, | |
| "loss": 0.0649, | |
| "num_input_tokens_seen": 6591000, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.5753026634382565, | |
| "grad_norm": 1.637086033821106, | |
| "learning_rate": 1.44423068675212e-06, | |
| "loss": 0.0531, | |
| "num_input_tokens_seen": 6615800, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.5849878934624697, | |
| "grad_norm": 0.06637797504663467, | |
| "learning_rate": 1.3797647365046411e-06, | |
| "loss": 0.0426, | |
| "num_input_tokens_seen": 6639288, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.594673123486683, | |
| "grad_norm": 0.9268229603767395, | |
| "learning_rate": 1.3167012082834212e-06, | |
| "loss": 0.0368, | |
| "num_input_tokens_seen": 6664632, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.6043583535108956, | |
| "grad_norm": 4.011239528656006, | |
| "learning_rate": 1.2550465954917932e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 6689496, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.614043583535109, | |
| "grad_norm": 3.382112741470337, | |
| "learning_rate": 1.1948072464626102e-06, | |
| "loss": 0.0331, | |
| "num_input_tokens_seen": 6714552, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.623728813559322, | |
| "grad_norm": 5.245890140533447, | |
| "learning_rate": 1.1359893638045854e-06, | |
| "loss": 0.0226, | |
| "num_input_tokens_seen": 6739320, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.633414043583535, | |
| "grad_norm": 2.0806005001068115, | |
| "learning_rate": 1.0785990037636335e-06, | |
| "loss": 0.0611, | |
| "num_input_tokens_seen": 6763352, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.6430992736077483, | |
| "grad_norm": 2.040339469909668, | |
| "learning_rate": 1.022642075599286e-06, | |
| "loss": 0.0615, | |
| "num_input_tokens_seen": 6787544, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.6527845036319615, | |
| "grad_norm": 4.939095973968506, | |
| "learning_rate": 9.68124340976232e-07, | |
| "loss": 0.0393, | |
| "num_input_tokens_seen": 6812760, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.6624697336561742, | |
| "grad_norm": 0.7793028354644775, | |
| "learning_rate": 9.150514133710647e-07, | |
| "loss": 0.0656, | |
| "num_input_tokens_seen": 6838008, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.6721549636803874, | |
| "grad_norm": 0.568551778793335, | |
| "learning_rate": 8.634287574942834e-07, | |
| "loss": 0.0452, | |
| "num_input_tokens_seen": 6863320, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.6818401937046006, | |
| "grad_norm": 5.33021354675293, | |
| "learning_rate": 8.132616887276212e-07, | |
| "loss": 0.0404, | |
| "num_input_tokens_seen": 6888824, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.6915254237288133, | |
| "grad_norm": 4.118853569030762, | |
| "learning_rate": 7.645553725767229e-07, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 6913048, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.7012106537530265, | |
| "grad_norm": 1.218005895614624, | |
| "learning_rate": 7.173148241392957e-07, | |
| "loss": 0.0459, | |
| "num_input_tokens_seen": 6937432, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.7108958837772397, | |
| "grad_norm": 0.6871452927589417, | |
| "learning_rate": 6.71544907588712e-07, | |
| "loss": 0.0386, | |
| "num_input_tokens_seen": 6962584, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.720581113801453, | |
| "grad_norm": 2.3115310668945312, | |
| "learning_rate": 6.272503356731601e-07, | |
| "loss": 0.0714, | |
| "num_input_tokens_seen": 6987768, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.730266343825666, | |
| "grad_norm": 4.2863569259643555, | |
| "learning_rate": 5.84435669230401e-07, | |
| "loss": 0.0364, | |
| "num_input_tokens_seen": 7013336, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.739951573849879, | |
| "grad_norm": 0.879754900932312, | |
| "learning_rate": 5.431053167181515e-07, | |
| "loss": 0.0346, | |
| "num_input_tokens_seen": 7038648, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.749636803874092, | |
| "grad_norm": 1.9641544818878174, | |
| "learning_rate": 5.032635337601687e-07, | |
| "loss": 0.0337, | |
| "num_input_tokens_seen": 7064184, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.759322033898305, | |
| "grad_norm": 0.6523151993751526, | |
| "learning_rate": 4.6491442270805596e-07, | |
| "loss": 0.0229, | |
| "num_input_tokens_seen": 7089336, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.7690072639225183, | |
| "grad_norm": 0.46984636783599854, | |
| "learning_rate": 4.280619322188628e-07, | |
| "loss": 0.0472, | |
| "num_input_tokens_seen": 7114072, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.778692493946731, | |
| "grad_norm": 2.178297519683838, | |
| "learning_rate": 3.9270985684851545e-07, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 7139576, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.788377723970944, | |
| "grad_norm": 3.751574993133545, | |
| "learning_rate": 3.588618366610941e-07, | |
| "loss": 0.0442, | |
| "num_input_tokens_seen": 7165432, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.7980629539951574, | |
| "grad_norm": 1.0459034442901611, | |
| "learning_rate": 3.2652135685403593e-07, | |
| "loss": 0.0324, | |
| "num_input_tokens_seen": 7190808, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.8077481840193705, | |
| "grad_norm": 3.6684751510620117, | |
| "learning_rate": 2.9569174739928096e-07, | |
| "loss": 0.0497, | |
| "num_input_tokens_seen": 7216440, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.8174334140435837, | |
| "grad_norm": 4.388014316558838, | |
| "learning_rate": 2.663761827003941e-07, | |
| "loss": 0.0404, | |
| "num_input_tokens_seen": 7243480, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.8271186440677964, | |
| "grad_norm": 6.251937389373779, | |
| "learning_rate": 2.38577681265707e-07, | |
| "loss": 0.0479, | |
| "num_input_tokens_seen": 7268568, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.8368038740920096, | |
| "grad_norm": 2.676504611968994, | |
| "learning_rate": 2.122991053975215e-07, | |
| "loss": 0.0378, | |
| "num_input_tokens_seen": 7293784, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.846489104116223, | |
| "grad_norm": 4.877316474914551, | |
| "learning_rate": 1.8754316089737878e-07, | |
| "loss": 0.0328, | |
| "num_input_tokens_seen": 7318680, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.856174334140436, | |
| "grad_norm": 1.454691767692566, | |
| "learning_rate": 1.6431239678746546e-07, | |
| "loss": 0.0411, | |
| "num_input_tokens_seen": 7343864, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.8658595641646487, | |
| "grad_norm": 3.7415764331817627, | |
| "learning_rate": 1.4260920504814366e-07, | |
| "loss": 0.0649, | |
| "num_input_tokens_seen": 7370232, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.875544794188862, | |
| "grad_norm": 2.577986240386963, | |
| "learning_rate": 1.22435820371658e-07, | |
| "loss": 0.0462, | |
| "num_input_tokens_seen": 7394936, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.885230024213075, | |
| "grad_norm": 4.861838340759277, | |
| "learning_rate": 1.0379431993204458e-07, | |
| "loss": 0.0425, | |
| "num_input_tokens_seen": 7420088, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.8949152542372882, | |
| "grad_norm": 3.2706315517425537, | |
| "learning_rate": 8.668662317124043e-08, | |
| "loss": 0.0418, | |
| "num_input_tokens_seen": 7445048, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.9046004842615014, | |
| "grad_norm": 0.6351612210273743, | |
| "learning_rate": 7.111449160146333e-08, | |
| "loss": 0.022, | |
| "num_input_tokens_seen": 7469144, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.914285714285714, | |
| "grad_norm": 2.6043741703033447, | |
| "learning_rate": 5.7079528623816824e-08, | |
| "loss": 0.0529, | |
| "num_input_tokens_seen": 7493528, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.9239709443099273, | |
| "grad_norm": 0.3836284875869751, | |
| "learning_rate": 4.4583179363210656e-08, | |
| "loss": 0.0335, | |
| "num_input_tokens_seen": 7517560, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.9336561743341405, | |
| "grad_norm": 0.7341143488883972, | |
| "learning_rate": 3.3626730519551455e-08, | |
| "loss": 0.0338, | |
| "num_input_tokens_seen": 7542552, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.9433414043583537, | |
| "grad_norm": 2.3991236686706543, | |
| "learning_rate": 2.4211310235258687e-08, | |
| "loss": 0.0403, | |
| "num_input_tokens_seen": 7566968, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.9530266343825664, | |
| "grad_norm": 1.5679802894592285, | |
| "learning_rate": 1.633788797910929e-08, | |
| "loss": 0.0259, | |
| "num_input_tokens_seen": 7591672, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.9627118644067796, | |
| "grad_norm": 0.794366717338562, | |
| "learning_rate": 1.0007274446409143e-08, | |
| "loss": 0.0392, | |
| "num_input_tokens_seen": 7616536, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.9723970944309928, | |
| "grad_norm": 1.9847420454025269, | |
| "learning_rate": 5.220121475519868e-09, | |
| "loss": 0.0487, | |
| "num_input_tokens_seen": 7640824, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.982082324455206, | |
| "grad_norm": 3.2490248680114746, | |
| "learning_rate": 1.976921980745838e-09, | |
| "loss": 0.056, | |
| "num_input_tokens_seen": 7666328, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.991767554479419, | |
| "grad_norm": 0.0969802513718605, | |
| "learning_rate": 2.780099015747828e-10, | |
| "loss": 0.0201, | |
| "num_input_tokens_seen": 7691224, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.9975786924939465, | |
| "num_input_tokens_seen": 7706072, | |
| "step": 3096, | |
| "total_flos": 3.298866475009966e+17, | |
| "train_loss": 0.08281170262038245, | |
| "train_runtime": 2763.8125, | |
| "train_samples_per_second": 8.963, | |
| "train_steps_per_second": 1.12 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3096, | |
| "num_input_tokens_seen": 7706072, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.298866475009966e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |