{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9975786924939465, "eval_steps": 500, "global_step": 3096, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009685230024213076, "grad_norm": 6.778852939605713, "learning_rate": 2.9999227754514262e-05, "loss": 0.8519, "num_input_tokens_seen": 25568, "step": 10 }, { "epoch": 0.01937046004842615, "grad_norm": 3.0029561519622803, "learning_rate": 2.9996911097572118e-05, "loss": 0.189, "num_input_tokens_seen": 51072, "step": 20 }, { "epoch": 0.029055690072639227, "grad_norm": 5.477710247039795, "learning_rate": 2.9993050267710624e-05, "loss": 0.1648, "num_input_tokens_seen": 76416, "step": 30 }, { "epoch": 0.0387409200968523, "grad_norm": 4.35634183883667, "learning_rate": 2.9987645662464235e-05, "loss": 0.1905, "num_input_tokens_seen": 101344, "step": 40 }, { "epoch": 0.048426150121065374, "grad_norm": 4.523565292358398, "learning_rate": 2.9980697838323884e-05, "loss": 0.1794, "num_input_tokens_seen": 126656, "step": 50 }, { "epoch": 0.05811138014527845, "grad_norm": 1.9348187446594238, "learning_rate": 2.9972207510679677e-05, "loss": 0.1528, "num_input_tokens_seen": 151200, "step": 60 }, { "epoch": 0.06779661016949153, "grad_norm": 2.981433629989624, "learning_rate": 2.996217555374725e-05, "loss": 0.1742, "num_input_tokens_seen": 175968, "step": 70 }, { "epoch": 0.0774818401937046, "grad_norm": 3.6294591426849365, "learning_rate": 2.9950603000477722e-05, "loss": 0.1565, "num_input_tokens_seen": 201280, "step": 80 }, { "epoch": 0.08716707021791767, "grad_norm": 2.5459301471710205, "learning_rate": 2.993749104245137e-05, "loss": 0.1499, "num_input_tokens_seen": 226432, "step": 90 }, { "epoch": 0.09685230024213075, "grad_norm": 2.2721059322357178, "learning_rate": 2.992284102975491e-05, "loss": 0.1441, "num_input_tokens_seen": 251744, "step": 100 }, { "epoch": 0.10653753026634383, "grad_norm": 2.0033624172210693, "learning_rate": 2.9906654470842492e-05, "loss": 0.1245, "num_input_tokens_seen": 276480, "step": 110 }, { "epoch": 0.1162227602905569, "grad_norm": 8.585118293762207, "learning_rate": 2.9888933032380397e-05, "loss": 0.1333, "num_input_tokens_seen": 301664, "step": 120 }, { "epoch": 0.12590799031476999, "grad_norm": 1.423967719078064, "learning_rate": 2.9869678539075403e-05, "loss": 0.1728, "num_input_tokens_seen": 326784, "step": 130 }, { "epoch": 0.13559322033898305, "grad_norm": 2.6306211948394775, "learning_rate": 2.9848892973486912e-05, "loss": 0.1281, "num_input_tokens_seen": 351328, "step": 140 }, { "epoch": 0.14527845036319612, "grad_norm": 2.5618090629577637, "learning_rate": 2.9826578475822825e-05, "loss": 0.1136, "num_input_tokens_seen": 376000, "step": 150 }, { "epoch": 0.1549636803874092, "grad_norm": 2.694077730178833, "learning_rate": 2.980273734371914e-05, "loss": 0.1277, "num_input_tokens_seen": 400384, "step": 160 }, { "epoch": 0.16464891041162227, "grad_norm": 2.632338047027588, "learning_rate": 2.9777372032003423e-05, "loss": 0.1028, "num_input_tokens_seen": 426432, "step": 170 }, { "epoch": 0.17433414043583534, "grad_norm": 2.3446829319000244, "learning_rate": 2.975048515244199e-05, "loss": 0.1245, "num_input_tokens_seen": 451712, "step": 180 }, { "epoch": 0.18401937046004843, "grad_norm": 1.8457319736480713, "learning_rate": 2.9722079473471035e-05, "loss": 0.142, "num_input_tokens_seen": 476960, "step": 190 }, { "epoch": 0.1937046004842615, "grad_norm": 1.8676010370254517, "learning_rate": 2.9692157919911536e-05, "loss": 0.1342, "num_input_tokens_seen": 501440, "step": 200 }, { "epoch": 0.2033898305084746, "grad_norm": 4.593673229217529, "learning_rate": 2.966072357266811e-05, "loss": 0.1314, "num_input_tokens_seen": 526656, "step": 210 }, { "epoch": 0.21307506053268765, "grad_norm": 3.9568676948547363, "learning_rate": 2.9627779668411795e-05, "loss": 0.171, "num_input_tokens_seen": 552544, "step": 220 }, { "epoch": 0.22276029055690072, "grad_norm": 2.4331846237182617, "learning_rate": 2.9593329599246766e-05, "loss": 0.115, "num_input_tokens_seen": 577472, "step": 230 }, { "epoch": 0.2324455205811138, "grad_norm": 2.525543212890625, "learning_rate": 2.955737691236108e-05, "loss": 0.1158, "num_input_tokens_seen": 601856, "step": 240 }, { "epoch": 0.24213075060532688, "grad_norm": 2.2355105876922607, "learning_rate": 2.9519925309661422e-05, "loss": 0.111, "num_input_tokens_seen": 627904, "step": 250 }, { "epoch": 0.25181598062953997, "grad_norm": 4.165389537811279, "learning_rate": 2.948097864739194e-05, "loss": 0.1314, "num_input_tokens_seen": 651936, "step": 260 }, { "epoch": 0.26150121065375304, "grad_norm": 3.1712851524353027, "learning_rate": 2.944054093573719e-05, "loss": 0.143, "num_input_tokens_seen": 676416, "step": 270 }, { "epoch": 0.2711864406779661, "grad_norm": 2.881716728210449, "learning_rate": 2.93986163384092e-05, "loss": 0.1121, "num_input_tokens_seen": 700832, "step": 280 }, { "epoch": 0.28087167070217917, "grad_norm": 3.060872793197632, "learning_rate": 2.9355209172218777e-05, "loss": 0.1159, "num_input_tokens_seen": 725824, "step": 290 }, { "epoch": 0.29055690072639223, "grad_norm": 4.449444770812988, "learning_rate": 2.931032390663101e-05, "loss": 0.133, "num_input_tokens_seen": 749408, "step": 300 }, { "epoch": 0.30024213075060535, "grad_norm": 5.323568344116211, "learning_rate": 2.926396516330506e-05, "loss": 0.1172, "num_input_tokens_seen": 773984, "step": 310 }, { "epoch": 0.3099273607748184, "grad_norm": 3.144500732421875, "learning_rate": 2.921613771561829e-05, "loss": 0.136, "num_input_tokens_seen": 799168, "step": 320 }, { "epoch": 0.3196125907990315, "grad_norm": 2.433586359024048, "learning_rate": 2.916684648817478e-05, "loss": 0.0973, "num_input_tokens_seen": 824320, "step": 330 }, { "epoch": 0.32929782082324455, "grad_norm": 3.349472761154175, "learning_rate": 2.9116096556298256e-05, "loss": 0.13, "num_input_tokens_seen": 849632, "step": 340 }, { "epoch": 0.3389830508474576, "grad_norm": 1.8927061557769775, "learning_rate": 2.9063893145509475e-05, "loss": 0.1257, "num_input_tokens_seen": 874400, "step": 350 }, { "epoch": 0.3486682808716707, "grad_norm": 3.972686529159546, "learning_rate": 2.901024163098822e-05, "loss": 0.1155, "num_input_tokens_seen": 899264, "step": 360 }, { "epoch": 0.3583535108958838, "grad_norm": 1.177282452583313, "learning_rate": 2.8955147537019815e-05, "loss": 0.1251, "num_input_tokens_seen": 924544, "step": 370 }, { "epoch": 0.36803874092009686, "grad_norm": 1.9911576509475708, "learning_rate": 2.88986165364263e-05, "loss": 0.1147, "num_input_tokens_seen": 949792, "step": 380 }, { "epoch": 0.37772397094430993, "grad_norm": 2.402615785598755, "learning_rate": 2.8840654449982344e-05, "loss": 0.1433, "num_input_tokens_seen": 974112, "step": 390 }, { "epoch": 0.387409200968523, "grad_norm": 1.3184998035430908, "learning_rate": 2.8781267245815898e-05, "loss": 0.1117, "num_input_tokens_seen": 999168, "step": 400 }, { "epoch": 0.39709443099273606, "grad_norm": 1.9284625053405762, "learning_rate": 2.8720461038793672e-05, "loss": 0.1353, "num_input_tokens_seen": 1024320, "step": 410 }, { "epoch": 0.4067796610169492, "grad_norm": 3.1020259857177734, "learning_rate": 2.8658242089891515e-05, "loss": 0.1165, "num_input_tokens_seen": 1049088, "step": 420 }, { "epoch": 0.41646489104116224, "grad_norm": 2.203179359436035, "learning_rate": 2.8594616805549752e-05, "loss": 0.1215, "num_input_tokens_seen": 1073632, "step": 430 }, { "epoch": 0.4261501210653753, "grad_norm": 2.053194522857666, "learning_rate": 2.8529591737013526e-05, "loss": 0.1066, "num_input_tokens_seen": 1098208, "step": 440 }, { "epoch": 0.4358353510895884, "grad_norm": 2.780935049057007, "learning_rate": 2.8463173579658258e-05, "loss": 0.0879, "num_input_tokens_seen": 1122336, "step": 450 }, { "epoch": 0.44552058111380144, "grad_norm": 1.9929611682891846, "learning_rate": 2.8395369172300235e-05, "loss": 0.1141, "num_input_tokens_seen": 1147392, "step": 460 }, { "epoch": 0.4552058111380145, "grad_norm": 1.1469779014587402, "learning_rate": 2.8326185496492464e-05, "loss": 0.1052, "num_input_tokens_seen": 1173248, "step": 470 }, { "epoch": 0.4648910411622276, "grad_norm": 2.501117706298828, "learning_rate": 2.825562967580579e-05, "loss": 0.1086, "num_input_tokens_seen": 1197984, "step": 480 }, { "epoch": 0.4745762711864407, "grad_norm": 2.0266308784484863, "learning_rate": 2.8183708975095406e-05, "loss": 0.1201, "num_input_tokens_seen": 1222720, "step": 490 }, { "epoch": 0.48426150121065376, "grad_norm": 1.1120251417160034, "learning_rate": 2.8110430799752845e-05, "loss": 0.1319, "num_input_tokens_seen": 1247232, "step": 500 }, { "epoch": 0.4939467312348668, "grad_norm": 1.2014496326446533, "learning_rate": 2.8035802694943457e-05, "loss": 0.1071, "num_input_tokens_seen": 1273184, "step": 510 }, { "epoch": 0.5036319612590799, "grad_norm": 1.1245245933532715, "learning_rate": 2.7959832344829512e-05, "loss": 0.1554, "num_input_tokens_seen": 1298688, "step": 520 }, { "epoch": 0.513317191283293, "grad_norm": 2.031115770339966, "learning_rate": 2.7882527571779003e-05, "loss": 0.1196, "num_input_tokens_seen": 1324128, "step": 530 }, { "epoch": 0.5230024213075061, "grad_norm": 1.7691289186477661, "learning_rate": 2.78038963355602e-05, "loss": 0.1334, "num_input_tokens_seen": 1349120, "step": 540 }, { "epoch": 0.5326876513317191, "grad_norm": 2.9496989250183105, "learning_rate": 2.7723946732522055e-05, "loss": 0.1109, "num_input_tokens_seen": 1374304, "step": 550 }, { "epoch": 0.5423728813559322, "grad_norm": 2.2881715297698975, "learning_rate": 2.764268699476058e-05, "loss": 0.1274, "num_input_tokens_seen": 1399136, "step": 560 }, { "epoch": 0.5520581113801453, "grad_norm": 1.9754095077514648, "learning_rate": 2.756012548927119e-05, "loss": 0.1397, "num_input_tokens_seen": 1424672, "step": 570 }, { "epoch": 0.5617433414043583, "grad_norm": 1.9883428812026978, "learning_rate": 2.7476270717087215e-05, "loss": 0.101, "num_input_tokens_seen": 1449024, "step": 580 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9653130769729614, "learning_rate": 2.7391131312404556e-05, "loss": 0.0941, "num_input_tokens_seen": 1475264, "step": 590 }, { "epoch": 0.5811138014527845, "grad_norm": 4.576601028442383, "learning_rate": 2.7304716041692663e-05, "loss": 0.0865, "num_input_tokens_seen": 1500064, "step": 600 }, { "epoch": 0.5907990314769975, "grad_norm": 2.4046311378479004, "learning_rate": 2.7217033802791906e-05, "loss": 0.1596, "num_input_tokens_seen": 1524448, "step": 610 }, { "epoch": 0.6004842615012107, "grad_norm": 1.7785555124282837, "learning_rate": 2.7128093623997368e-05, "loss": 0.0891, "num_input_tokens_seen": 1549536, "step": 620 }, { "epoch": 0.6101694915254238, "grad_norm": 2.2736170291900635, "learning_rate": 2.7037904663129262e-05, "loss": 0.1085, "num_input_tokens_seen": 1573408, "step": 630 }, { "epoch": 0.6198547215496368, "grad_norm": 1.0862345695495605, "learning_rate": 2.6946476206589972e-05, "loss": 0.1023, "num_input_tokens_seen": 1597888, "step": 640 }, { "epoch": 0.6295399515738499, "grad_norm": 0.5358290672302246, "learning_rate": 2.6853817668407875e-05, "loss": 0.0669, "num_input_tokens_seen": 1623296, "step": 650 }, { "epoch": 0.639225181598063, "grad_norm": 2.3138749599456787, "learning_rate": 2.6759938589268023e-05, "loss": 0.1017, "num_input_tokens_seen": 1649216, "step": 660 }, { "epoch": 0.648910411622276, "grad_norm": 3.2054226398468018, "learning_rate": 2.6664848635529742e-05, "loss": 0.1432, "num_input_tokens_seen": 1673760, "step": 670 }, { "epoch": 0.6585956416464891, "grad_norm": 1.8352829217910767, "learning_rate": 2.6568557598231385e-05, "loss": 0.1081, "num_input_tokens_seen": 1698592, "step": 680 }, { "epoch": 0.6682808716707022, "grad_norm": 1.203284740447998, "learning_rate": 2.6471075392082125e-05, "loss": 0.1037, "num_input_tokens_seen": 1723296, "step": 690 }, { "epoch": 0.6779661016949152, "grad_norm": 1.635628581047058, "learning_rate": 2.6372412054441116e-05, "loss": 0.1216, "num_input_tokens_seen": 1748384, "step": 700 }, { "epoch": 0.6876513317191283, "grad_norm": 0.8993457555770874, "learning_rate": 2.6272577744283965e-05, "loss": 0.0853, "num_input_tokens_seen": 1773600, "step": 710 }, { "epoch": 0.6973365617433414, "grad_norm": 1.7306419610977173, "learning_rate": 2.617158274115673e-05, "loss": 0.1034, "num_input_tokens_seen": 1798656, "step": 720 }, { "epoch": 0.7070217917675545, "grad_norm": 2.770066976547241, "learning_rate": 2.6069437444117432e-05, "loss": 0.0872, "num_input_tokens_seen": 1824544, "step": 730 }, { "epoch": 0.7167070217917676, "grad_norm": 2.3590221405029297, "learning_rate": 2.596615237066535e-05, "loss": 0.1063, "num_input_tokens_seen": 1848896, "step": 740 }, { "epoch": 0.7263922518159807, "grad_norm": 1.0496519804000854, "learning_rate": 2.586173815565805e-05, "loss": 0.1104, "num_input_tokens_seen": 1873248, "step": 750 }, { "epoch": 0.7360774818401937, "grad_norm": 1.513573408126831, "learning_rate": 2.575620555021634e-05, "loss": 0.1125, "num_input_tokens_seen": 1897184, "step": 760 }, { "epoch": 0.7457627118644068, "grad_norm": 1.5545728206634521, "learning_rate": 2.564956542061732e-05, "loss": 0.0969, "num_input_tokens_seen": 1922368, "step": 770 }, { "epoch": 0.7554479418886199, "grad_norm": 1.9260263442993164, "learning_rate": 2.5541828747175477e-05, "loss": 0.1142, "num_input_tokens_seen": 1947904, "step": 780 }, { "epoch": 0.7651331719128329, "grad_norm": 2.396538734436035, "learning_rate": 2.543300662311211e-05, "loss": 0.0926, "num_input_tokens_seen": 1971872, "step": 790 }, { "epoch": 0.774818401937046, "grad_norm": 1.7069965600967407, "learning_rate": 2.532311025341309e-05, "loss": 0.0802, "num_input_tokens_seen": 1996352, "step": 800 }, { "epoch": 0.784503631961259, "grad_norm": 5.540910243988037, "learning_rate": 2.5212150953675133e-05, "loss": 0.1248, "num_input_tokens_seen": 2020480, "step": 810 }, { "epoch": 0.7941888619854721, "grad_norm": 1.7795952558517456, "learning_rate": 2.5100140148940688e-05, "loss": 0.0767, "num_input_tokens_seen": 2044448, "step": 820 }, { "epoch": 0.8038740920096852, "grad_norm": 2.7387983798980713, "learning_rate": 2.498708937252153e-05, "loss": 0.1239, "num_input_tokens_seen": 2070400, "step": 830 }, { "epoch": 0.8135593220338984, "grad_norm": 2.1243462562561035, "learning_rate": 2.4873010264811222e-05, "loss": 0.108, "num_input_tokens_seen": 2095392, "step": 840 }, { "epoch": 0.8232445520581114, "grad_norm": 0.9928631782531738, "learning_rate": 2.4757914572086555e-05, "loss": 0.0994, "num_input_tokens_seen": 2120192, "step": 850 }, { "epoch": 0.8329297820823245, "grad_norm": 6.047460556030273, "learning_rate": 2.464181414529809e-05, "loss": 0.0927, "num_input_tokens_seen": 2144384, "step": 860 }, { "epoch": 0.8426150121065376, "grad_norm": 2.2197115421295166, "learning_rate": 2.4524720938849883e-05, "loss": 0.1328, "num_input_tokens_seen": 2168704, "step": 870 }, { "epoch": 0.8523002421307506, "grad_norm": 2.0752601623535156, "learning_rate": 2.440664700936861e-05, "loss": 0.1229, "num_input_tokens_seen": 2193248, "step": 880 }, { "epoch": 0.8619854721549637, "grad_norm": 1.00425386428833, "learning_rate": 2.4287604514462152e-05, "loss": 0.0957, "num_input_tokens_seen": 2217568, "step": 890 }, { "epoch": 0.8716707021791767, "grad_norm": 1.9153094291687012, "learning_rate": 2.416760571146774e-05, "loss": 0.0975, "num_input_tokens_seen": 2242048, "step": 900 }, { "epoch": 0.8813559322033898, "grad_norm": 2.3558013439178467, "learning_rate": 2.4046662956189898e-05, "loss": 0.1068, "num_input_tokens_seen": 2266112, "step": 910 }, { "epoch": 0.8910411622276029, "grad_norm": 2.546351909637451, "learning_rate": 2.3924788701628197e-05, "loss": 0.0688, "num_input_tokens_seen": 2290720, "step": 920 }, { "epoch": 0.9007263922518159, "grad_norm": 1.2526168823242188, "learning_rate": 2.3801995496695028e-05, "loss": 0.1141, "num_input_tokens_seen": 2315488, "step": 930 }, { "epoch": 0.910411622276029, "grad_norm": 2.134089231491089, "learning_rate": 2.367829598492348e-05, "loss": 0.1328, "num_input_tokens_seen": 2340992, "step": 940 }, { "epoch": 0.9200968523002422, "grad_norm": 1.332915186882019, "learning_rate": 2.3553702903165502e-05, "loss": 0.1, "num_input_tokens_seen": 2366880, "step": 950 }, { "epoch": 0.9297820823244553, "grad_norm": 1.5140970945358276, "learning_rate": 2.3428229080280407e-05, "loss": 0.1089, "num_input_tokens_seen": 2392000, "step": 960 }, { "epoch": 0.9394673123486683, "grad_norm": 1.531954288482666, "learning_rate": 2.330188743581398e-05, "loss": 0.0924, "num_input_tokens_seen": 2417472, "step": 970 }, { "epoch": 0.9491525423728814, "grad_norm": 1.3347736597061157, "learning_rate": 2.3174690978668155e-05, "loss": 0.1205, "num_input_tokens_seen": 2442496, "step": 980 }, { "epoch": 0.9588377723970944, "grad_norm": 3.1497702598571777, "learning_rate": 2.3046652805761588e-05, "loss": 0.1004, "num_input_tokens_seen": 2467392, "step": 990 }, { "epoch": 0.9685230024213075, "grad_norm": 1.6756023168563843, "learning_rate": 2.2917786100681078e-05, "loss": 0.1007, "num_input_tokens_seen": 2492768, "step": 1000 }, { "epoch": 0.9782082324455206, "grad_norm": 2.56594181060791, "learning_rate": 2.2788104132324125e-05, "loss": 0.1179, "num_input_tokens_seen": 2518176, "step": 1010 }, { "epoch": 0.9878934624697336, "grad_norm": 2.1090595722198486, "learning_rate": 2.2657620253532685e-05, "loss": 0.0971, "num_input_tokens_seen": 2543296, "step": 1020 }, { "epoch": 0.9975786924939467, "grad_norm": 0.41959595680236816, "learning_rate": 2.252634789971827e-05, "loss": 0.0932, "num_input_tokens_seen": 2567680, "step": 1030 }, { "epoch": 1.006779661016949, "grad_norm": 1.6389803886413574, "learning_rate": 2.2394300587478566e-05, "loss": 0.0924, "num_input_tokens_seen": 2591016, "step": 1040 }, { "epoch": 1.0164648910411622, "grad_norm": 1.4045557975769043, "learning_rate": 2.2261491913205684e-05, "loss": 0.0985, "num_input_tokens_seen": 2615752, "step": 1050 }, { "epoch": 1.0261501210653754, "grad_norm": 2.0734925270080566, "learning_rate": 2.212793555168617e-05, "loss": 0.0853, "num_input_tokens_seen": 2640200, "step": 1060 }, { "epoch": 1.0358353510895884, "grad_norm": 2.1590147018432617, "learning_rate": 2.1993645254692994e-05, "loss": 0.116, "num_input_tokens_seen": 2665416, "step": 1070 }, { "epoch": 1.0455205811138015, "grad_norm": 1.739646553993225, "learning_rate": 2.1858634849569578e-05, "loss": 0.0972, "num_input_tokens_seen": 2690376, "step": 1080 }, { "epoch": 1.0552058111380145, "grad_norm": 0.6458954215049744, "learning_rate": 2.1722918237806042e-05, "loss": 0.0884, "num_input_tokens_seen": 2715080, "step": 1090 }, { "epoch": 1.0648910411622277, "grad_norm": 2.2830138206481934, "learning_rate": 2.158650939360782e-05, "loss": 0.073, "num_input_tokens_seen": 2740424, "step": 1100 }, { "epoch": 1.0745762711864406, "grad_norm": 1.5225194692611694, "learning_rate": 2.1449422362456794e-05, "loss": 0.0813, "num_input_tokens_seen": 2765640, "step": 1110 }, { "epoch": 1.0842615012106538, "grad_norm": 1.683604121208191, "learning_rate": 2.13116712596651e-05, "loss": 0.0953, "num_input_tokens_seen": 2791176, "step": 1120 }, { "epoch": 1.0939467312348667, "grad_norm": 1.5679166316986084, "learning_rate": 2.1173270268921703e-05, "loss": 0.0933, "num_input_tokens_seen": 2816072, "step": 1130 }, { "epoch": 1.10363196125908, "grad_norm": 1.3097947835922241, "learning_rate": 2.1034233640831988e-05, "loss": 0.0819, "num_input_tokens_seen": 2840776, "step": 1140 }, { "epoch": 1.113317191283293, "grad_norm": 0.5728388428688049, "learning_rate": 2.0894575691450396e-05, "loss": 0.0611, "num_input_tokens_seen": 2865416, "step": 1150 }, { "epoch": 1.123002421307506, "grad_norm": 2.3043558597564697, "learning_rate": 2.0754310800806395e-05, "loss": 0.0748, "num_input_tokens_seen": 2890248, "step": 1160 }, { "epoch": 1.1326876513317192, "grad_norm": 1.2087112665176392, "learning_rate": 2.0613453411423797e-05, "loss": 0.0959, "num_input_tokens_seen": 2916392, "step": 1170 }, { "epoch": 1.1423728813559322, "grad_norm": 1.5639240741729736, "learning_rate": 2.0472018026833684e-05, "loss": 0.0709, "num_input_tokens_seen": 2941160, "step": 1180 }, { "epoch": 1.1520581113801454, "grad_norm": 0.5889459848403931, "learning_rate": 2.0330019210081022e-05, "loss": 0.0731, "num_input_tokens_seen": 2966120, "step": 1190 }, { "epoch": 1.1617433414043583, "grad_norm": 1.854230523109436, "learning_rate": 2.0187471582225173e-05, "loss": 0.1005, "num_input_tokens_seen": 2990088, "step": 1200 }, { "epoch": 1.1714285714285715, "grad_norm": 2.01247239112854, "learning_rate": 2.004438982083442e-05, "loss": 0.0579, "num_input_tokens_seen": 3015400, "step": 1210 }, { "epoch": 1.1811138014527844, "grad_norm": 2.292900323867798, "learning_rate": 1.9900788658474677e-05, "loss": 0.0792, "num_input_tokens_seen": 3039464, "step": 1220 }, { "epoch": 1.1907990314769976, "grad_norm": 1.4194159507751465, "learning_rate": 1.975668288119252e-05, "loss": 0.057, "num_input_tokens_seen": 3063816, "step": 1230 }, { "epoch": 1.2004842615012106, "grad_norm": 1.0512489080429077, "learning_rate": 1.961208732699275e-05, "loss": 0.102, "num_input_tokens_seen": 3088968, "step": 1240 }, { "epoch": 1.2101694915254237, "grad_norm": 0.9465106129646301, "learning_rate": 1.9467016884310565e-05, "loss": 0.0691, "num_input_tokens_seen": 3113736, "step": 1250 }, { "epoch": 1.2198547215496367, "grad_norm": 1.274294376373291, "learning_rate": 1.9321486490478565e-05, "loss": 0.0668, "num_input_tokens_seen": 3138344, "step": 1260 }, { "epoch": 1.2295399515738499, "grad_norm": 1.9390579462051392, "learning_rate": 1.91755111301887e-05, "loss": 0.0711, "num_input_tokens_seen": 3163496, "step": 1270 }, { "epoch": 1.239225181598063, "grad_norm": 1.2855744361877441, "learning_rate": 1.902910583394938e-05, "loss": 0.0605, "num_input_tokens_seen": 3188392, "step": 1280 }, { "epoch": 1.248910411622276, "grad_norm": 2.931248188018799, "learning_rate": 1.888228567653781e-05, "loss": 0.0448, "num_input_tokens_seen": 3213224, "step": 1290 }, { "epoch": 1.2585956416464892, "grad_norm": 1.9991300106048584, "learning_rate": 1.873506577544784e-05, "loss": 0.0815, "num_input_tokens_seen": 3238568, "step": 1300 }, { "epoch": 1.2682808716707021, "grad_norm": 1.3530927896499634, "learning_rate": 1.8587461289333327e-05, "loss": 0.1043, "num_input_tokens_seen": 3264264, "step": 1310 }, { "epoch": 1.2779661016949153, "grad_norm": 2.07991099357605, "learning_rate": 1.8439487416447353e-05, "loss": 0.1037, "num_input_tokens_seen": 3288840, "step": 1320 }, { "epoch": 1.2876513317191283, "grad_norm": 1.8533947467803955, "learning_rate": 1.8291159393077294e-05, "loss": 0.0928, "num_input_tokens_seen": 3313832, "step": 1330 }, { "epoch": 1.2973365617433414, "grad_norm": 1.118119716644287, "learning_rate": 1.814249249197602e-05, "loss": 0.0775, "num_input_tokens_seen": 3337736, "step": 1340 }, { "epoch": 1.3070217917675544, "grad_norm": 2.740079641342163, "learning_rate": 1.7993502020789294e-05, "loss": 0.0521, "num_input_tokens_seen": 3362024, "step": 1350 }, { "epoch": 1.3167070217917676, "grad_norm": 1.9268351793289185, "learning_rate": 1.7844203320479614e-05, "loss": 0.0687, "num_input_tokens_seen": 3387496, "step": 1360 }, { "epoch": 1.3263922518159807, "grad_norm": 2.3576388359069824, "learning_rate": 1.7694611763746632e-05, "loss": 0.0704, "num_input_tokens_seen": 3412072, "step": 1370 }, { "epoch": 1.3360774818401937, "grad_norm": 1.127432942390442, "learning_rate": 1.754474275344427e-05, "loss": 0.0826, "num_input_tokens_seen": 3437096, "step": 1380 }, { "epoch": 1.3457627118644067, "grad_norm": 4.377537250518799, "learning_rate": 1.7394611720994747e-05, "loss": 0.0445, "num_input_tokens_seen": 3462120, "step": 1390 }, { "epoch": 1.3554479418886198, "grad_norm": 2.1285200119018555, "learning_rate": 1.724423412479967e-05, "loss": 0.0951, "num_input_tokens_seen": 3486952, "step": 1400 }, { "epoch": 1.365133171912833, "grad_norm": 0.16216270625591278, "learning_rate": 1.7093625448648348e-05, "loss": 0.0539, "num_input_tokens_seen": 3512264, "step": 1410 }, { "epoch": 1.374818401937046, "grad_norm": 2.1299915313720703, "learning_rate": 1.694280120012349e-05, "loss": 0.0848, "num_input_tokens_seen": 3537192, "step": 1420 }, { "epoch": 1.3845036319612591, "grad_norm": 2.476757049560547, "learning_rate": 1.6791776909004434e-05, "loss": 0.0629, "num_input_tokens_seen": 3560872, "step": 1430 }, { "epoch": 1.394188861985472, "grad_norm": 0.4373377561569214, "learning_rate": 1.664056812566812e-05, "loss": 0.079, "num_input_tokens_seen": 3586216, "step": 1440 }, { "epoch": 1.4038740920096853, "grad_norm": 1.9471170902252197, "learning_rate": 1.648919041948792e-05, "loss": 0.0798, "num_input_tokens_seen": 3610792, "step": 1450 }, { "epoch": 1.4135593220338982, "grad_norm": 2.911750316619873, "learning_rate": 1.6337659377230544e-05, "loss": 0.0897, "num_input_tokens_seen": 3634760, "step": 1460 }, { "epoch": 1.4232445520581114, "grad_norm": 2.9474802017211914, "learning_rate": 1.61859906014511e-05, "loss": 0.0858, "num_input_tokens_seen": 3659560, "step": 1470 }, { "epoch": 1.4329297820823244, "grad_norm": 0.6501768827438354, "learning_rate": 1.6034199708886573e-05, "loss": 0.0532, "num_input_tokens_seen": 3684840, "step": 1480 }, { "epoch": 1.4426150121065375, "grad_norm": 1.6708017587661743, "learning_rate": 1.5882302328847847e-05, "loss": 0.0842, "num_input_tokens_seen": 3709096, "step": 1490 }, { "epoch": 1.4523002421307507, "grad_norm": 1.5014967918395996, "learning_rate": 1.5730314101610376e-05, "loss": 0.0367, "num_input_tokens_seen": 3734728, "step": 1500 }, { "epoch": 1.4619854721549637, "grad_norm": 3.2587804794311523, "learning_rate": 1.5578250676803824e-05, "loss": 0.1085, "num_input_tokens_seen": 3758984, "step": 1510 }, { "epoch": 1.4716707021791768, "grad_norm": 6.304242134094238, "learning_rate": 1.5426127711800636e-05, "loss": 0.0712, "num_input_tokens_seen": 3784296, "step": 1520 }, { "epoch": 1.4813559322033898, "grad_norm": 1.1681016683578491, "learning_rate": 1.5273960870103872e-05, "loss": 0.0705, "num_input_tokens_seen": 3809768, "step": 1530 }, { "epoch": 1.491041162227603, "grad_norm": 1.111617922782898, "learning_rate": 1.5121765819734418e-05, "loss": 0.071, "num_input_tokens_seen": 3834536, "step": 1540 }, { "epoch": 1.5007263922518161, "grad_norm": 1.7780523300170898, "learning_rate": 1.4969558231617681e-05, "loss": 0.0648, "num_input_tokens_seen": 3858792, "step": 1550 }, { "epoch": 1.510411622276029, "grad_norm": 2.2017934322357178, "learning_rate": 1.4817353777970038e-05, "loss": 0.0633, "num_input_tokens_seen": 3883976, "step": 1560 }, { "epoch": 1.520096852300242, "grad_norm": 1.8567978143692017, "learning_rate": 1.466516813068512e-05, "loss": 0.0726, "num_input_tokens_seen": 3908392, "step": 1570 }, { "epoch": 1.5297820823244552, "grad_norm": 2.567291021347046, "learning_rate": 1.451301695972015e-05, "loss": 0.0882, "num_input_tokens_seen": 3932552, "step": 1580 }, { "epoch": 1.5394673123486684, "grad_norm": 1.9968935251235962, "learning_rate": 1.436091593148244e-05, "loss": 0.1149, "num_input_tokens_seen": 3957672, "step": 1590 }, { "epoch": 1.5491525423728814, "grad_norm": 1.9058917760849, "learning_rate": 1.4208880707216323e-05, "loss": 0.0841, "num_input_tokens_seen": 3982824, "step": 1600 }, { "epoch": 1.5588377723970943, "grad_norm": 1.9218000173568726, "learning_rate": 1.405692694139054e-05, "loss": 0.0896, "num_input_tokens_seen": 4008072, "step": 1610 }, { "epoch": 1.5685230024213075, "grad_norm": 1.5786553621292114, "learning_rate": 1.3905070280086387e-05, "loss": 0.0629, "num_input_tokens_seen": 4033096, "step": 1620 }, { "epoch": 1.5782082324455207, "grad_norm": 2.503990888595581, "learning_rate": 1.3753326359386695e-05, "loss": 0.077, "num_input_tokens_seen": 4058120, "step": 1630 }, { "epoch": 1.5878934624697336, "grad_norm": 1.5616143941879272, "learning_rate": 1.3601710803765814e-05, "loss": 0.0853, "num_input_tokens_seen": 4082792, "step": 1640 }, { "epoch": 1.5975786924939466, "grad_norm": 1.2533211708068848, "learning_rate": 1.3450239224480884e-05, "loss": 0.0605, "num_input_tokens_seen": 4107336, "step": 1650 }, { "epoch": 1.6072639225181597, "grad_norm": 1.1046490669250488, "learning_rate": 1.329892721796433e-05, "loss": 0.0985, "num_input_tokens_seen": 4132456, "step": 1660 }, { "epoch": 1.616949152542373, "grad_norm": 1.143494725227356, "learning_rate": 1.314779036421802e-05, "loss": 0.0547, "num_input_tokens_seen": 4156584, "step": 1670 }, { "epoch": 1.626634382566586, "grad_norm": 2.6082706451416016, "learning_rate": 1.2996844225209033e-05, "loss": 0.0919, "num_input_tokens_seen": 4181448, "step": 1680 }, { "epoch": 1.636319612590799, "grad_norm": 2.4191458225250244, "learning_rate": 1.2846104343267283e-05, "loss": 0.1204, "num_input_tokens_seen": 4207560, "step": 1690 }, { "epoch": 1.646004842615012, "grad_norm": 2.051799774169922, "learning_rate": 1.2695586239485223e-05, "loss": 0.0664, "num_input_tokens_seen": 4232040, "step": 1700 }, { "epoch": 1.6556900726392252, "grad_norm": 1.525844931602478, "learning_rate": 1.254530541211968e-05, "loss": 0.0805, "num_input_tokens_seen": 4257576, "step": 1710 }, { "epoch": 1.6653753026634384, "grad_norm": 0.9474373459815979, "learning_rate": 1.2395277334996045e-05, "loss": 0.073, "num_input_tokens_seen": 4282472, "step": 1720 }, { "epoch": 1.6750605326876513, "grad_norm": 1.8932424783706665, "learning_rate": 1.2245517455915036e-05, "loss": 0.0734, "num_input_tokens_seen": 4306792, "step": 1730 }, { "epoch": 1.6847457627118643, "grad_norm": 1.9888746738433838, "learning_rate": 1.2096041195062051e-05, "loss": 0.0831, "num_input_tokens_seen": 4333384, "step": 1740 }, { "epoch": 1.6944309927360774, "grad_norm": 1.8355742692947388, "learning_rate": 1.1946863943419452e-05, "loss": 0.0691, "num_input_tokens_seen": 4358344, "step": 1750 }, { "epoch": 1.7041162227602906, "grad_norm": 2.8447251319885254, "learning_rate": 1.1798001061181799e-05, "loss": 0.0988, "num_input_tokens_seen": 4381768, "step": 1760 }, { "epoch": 1.7138014527845038, "grad_norm": 2.670257806777954, "learning_rate": 1.1649467876174252e-05, "loss": 0.0936, "num_input_tokens_seen": 4405192, "step": 1770 }, { "epoch": 1.7234866828087168, "grad_norm": 1.188839077949524, "learning_rate": 1.1501279682274368e-05, "loss": 0.0901, "num_input_tokens_seen": 4430344, "step": 1780 }, { "epoch": 1.7331719128329297, "grad_norm": 2.494746685028076, "learning_rate": 1.1353451737837312e-05, "loss": 0.0691, "num_input_tokens_seen": 4455336, "step": 1790 }, { "epoch": 1.7428571428571429, "grad_norm": 1.3223942518234253, "learning_rate": 1.1205999264124788e-05, "loss": 0.0668, "num_input_tokens_seen": 4480648, "step": 1800 }, { "epoch": 1.752542372881356, "grad_norm": 1.3812003135681152, "learning_rate": 1.105893744373776e-05, "loss": 0.0788, "num_input_tokens_seen": 4506600, "step": 1810 }, { "epoch": 1.762227602905569, "grad_norm": 0.7805346250534058, "learning_rate": 1.0912281419053139e-05, "loss": 0.0723, "num_input_tokens_seen": 4531368, "step": 1820 }, { "epoch": 1.771912832929782, "grad_norm": 1.105878472328186, "learning_rate": 1.0766046290664662e-05, "loss": 0.0779, "num_input_tokens_seen": 4555272, "step": 1830 }, { "epoch": 1.7815980629539951, "grad_norm": 1.8672295808792114, "learning_rate": 1.0620247115828044e-05, "loss": 0.0838, "num_input_tokens_seen": 4580328, "step": 1840 }, { "epoch": 1.7912832929782083, "grad_norm": 1.844306468963623, "learning_rate": 1.047489890691055e-05, "loss": 0.0594, "num_input_tokens_seen": 4605768, "step": 1850 }, { "epoch": 1.8009685230024213, "grad_norm": 1.2717005014419556, "learning_rate": 1.0330016629845276e-05, "loss": 0.04, "num_input_tokens_seen": 4631048, "step": 1860 }, { "epoch": 1.8106537530266342, "grad_norm": 3.5843582153320312, "learning_rate": 1.0185615202590144e-05, "loss": 0.084, "num_input_tokens_seen": 4656456, "step": 1870 }, { "epoch": 1.8203389830508474, "grad_norm": 4.254288673400879, "learning_rate": 1.004170949359187e-05, "loss": 0.0654, "num_input_tokens_seen": 4681384, "step": 1880 }, { "epoch": 1.8300242130750606, "grad_norm": 1.351646065711975, "learning_rate": 9.89831432025501e-06, "loss": 0.0712, "num_input_tokens_seen": 4706216, "step": 1890 }, { "epoch": 1.8397094430992738, "grad_norm": 1.9015384912490845, "learning_rate": 9.755444447416255e-06, "loss": 0.0829, "num_input_tokens_seen": 4730984, "step": 1900 }, { "epoch": 1.8493946731234867, "grad_norm": 1.3803085088729858, "learning_rate": 9.613114585824196e-06, "loss": 0.0532, "num_input_tokens_seen": 4755112, "step": 1910 }, { "epoch": 1.8590799031476997, "grad_norm": 6.487275123596191, "learning_rate": 9.471339390624574e-06, "loss": 0.0781, "num_input_tokens_seen": 4780232, "step": 1920 }, { "epoch": 1.8687651331719128, "grad_norm": 2.182865619659424, "learning_rate": 9.330133459851323e-06, "loss": 0.0908, "num_input_tokens_seen": 4805192, "step": 1930 }, { "epoch": 1.878450363196126, "grad_norm": 0.42010384798049927, "learning_rate": 9.189511332923463e-06, "loss": 0.0398, "num_input_tokens_seen": 4830856, "step": 1940 }, { "epoch": 1.888135593220339, "grad_norm": 1.609157919883728, "learning_rate": 9.049487489148008e-06, "loss": 0.0912, "num_input_tokens_seen": 4855656, "step": 1950 }, { "epoch": 1.897820823244552, "grad_norm": 2.4291250705718994, "learning_rate": 8.910076346229134e-06, "loss": 0.0746, "num_input_tokens_seen": 4880392, "step": 1960 }, { "epoch": 1.907506053268765, "grad_norm": 2.243717670440674, "learning_rate": 8.77129225878361e-06, "loss": 0.1066, "num_input_tokens_seen": 4905320, "step": 1970 }, { "epoch": 1.9171912832929783, "grad_norm": 2.145559072494507, "learning_rate": 8.633149516862777e-06, "loss": 0.0839, "num_input_tokens_seen": 4930536, "step": 1980 }, { "epoch": 1.9268765133171912, "grad_norm": 0.6746326088905334, "learning_rate": 8.495662344481135e-06, "loss": 0.0527, "num_input_tokens_seen": 4956168, "step": 1990 }, { "epoch": 1.9365617433414044, "grad_norm": 1.293521761894226, "learning_rate": 8.358844898151791e-06, "loss": 0.1033, "num_input_tokens_seen": 4980584, "step": 2000 }, { "epoch": 1.9462469733656174, "grad_norm": 1.7922570705413818, "learning_rate": 8.222711265428779e-06, "loss": 0.079, "num_input_tokens_seen": 5005992, "step": 2010 }, { "epoch": 1.9559322033898305, "grad_norm": 1.0770626068115234, "learning_rate": 8.087275463456548e-06, "loss": 0.0652, "num_input_tokens_seen": 5032168, "step": 2020 }, { "epoch": 1.9656174334140437, "grad_norm": 0.7968271374702454, "learning_rate": 7.952551437526648e-06, "loss": 0.0593, "num_input_tokens_seen": 5056296, "step": 2030 }, { "epoch": 1.9753026634382567, "grad_norm": 2.140667676925659, "learning_rate": 7.818553059641868e-06, "loss": 0.0933, "num_input_tokens_seen": 5080424, "step": 2040 }, { "epoch": 1.9849878934624696, "grad_norm": 2.905066967010498, "learning_rate": 7.685294127087852e-06, "loss": 0.059, "num_input_tokens_seen": 5104904, "step": 2050 }, { "epoch": 1.9946731234866828, "grad_norm": 2.5095653533935547, "learning_rate": 7.552788361012486e-06, "loss": 0.0766, "num_input_tokens_seen": 5129064, "step": 2060 }, { "epoch": 2.0038740920096854, "grad_norm": 1.0241445302963257, "learning_rate": 7.421049405013061e-06, "loss": 0.0637, "num_input_tokens_seen": 5152120, "step": 2070 }, { "epoch": 2.013559322033898, "grad_norm": 1.7620762586593628, "learning_rate": 7.290090823731452e-06, "loss": 0.0419, "num_input_tokens_seen": 5176728, "step": 2080 }, { "epoch": 2.0232445520581113, "grad_norm": 1.1471503973007202, "learning_rate": 7.159926101457423e-06, "loss": 0.0586, "num_input_tokens_seen": 5201176, "step": 2090 }, { "epoch": 2.0329297820823244, "grad_norm": 1.4868978261947632, "learning_rate": 7.030568640740202e-06, "loss": 0.0382, "num_input_tokens_seen": 5225368, "step": 2100 }, { "epoch": 2.0426150121065376, "grad_norm": 0.8362380266189575, "learning_rate": 6.902031761008456e-06, "loss": 0.0597, "num_input_tokens_seen": 5250136, "step": 2110 }, { "epoch": 2.052300242130751, "grad_norm": 2.6067404747009277, "learning_rate": 6.774328697198879e-06, "loss": 0.0367, "num_input_tokens_seen": 5274264, "step": 2120 }, { "epoch": 2.0619854721549635, "grad_norm": 1.6327483654022217, "learning_rate": 6.647472598393399e-06, "loss": 0.04, "num_input_tokens_seen": 5298264, "step": 2130 }, { "epoch": 2.0716707021791767, "grad_norm": 1.461899995803833, "learning_rate": 6.521476526465309e-06, "loss": 0.0426, "num_input_tokens_seen": 5322872, "step": 2140 }, { "epoch": 2.08135593220339, "grad_norm": 2.3133087158203125, "learning_rate": 6.3963534547343126e-06, "loss": 0.0706, "num_input_tokens_seen": 5348120, "step": 2150 }, { "epoch": 2.091041162227603, "grad_norm": 3.1375937461853027, "learning_rate": 6.27211626663071e-06, "loss": 0.0377, "num_input_tokens_seen": 5373240, "step": 2160 }, { "epoch": 2.100726392251816, "grad_norm": 2.147362470626831, "learning_rate": 6.148777754368862e-06, "loss": 0.0608, "num_input_tokens_seen": 5398296, "step": 2170 }, { "epoch": 2.110411622276029, "grad_norm": 0.6415455341339111, "learning_rate": 6.026350617630011e-06, "loss": 0.0334, "num_input_tokens_seen": 5424408, "step": 2180 }, { "epoch": 2.120096852300242, "grad_norm": 3.5363268852233887, "learning_rate": 5.904847462254646e-06, "loss": 0.0445, "num_input_tokens_seen": 5449880, "step": 2190 }, { "epoch": 2.1297820823244553, "grad_norm": 2.8637278079986572, "learning_rate": 5.784280798944537e-06, "loss": 0.0735, "num_input_tokens_seen": 5474808, "step": 2200 }, { "epoch": 2.1394673123486685, "grad_norm": 1.1030181646347046, "learning_rate": 5.6646630419745404e-06, "loss": 0.056, "num_input_tokens_seen": 5499672, "step": 2210 }, { "epoch": 2.1491525423728812, "grad_norm": 1.6034140586853027, "learning_rate": 5.5460065079143694e-06, "loss": 0.0703, "num_input_tokens_seen": 5523672, "step": 2220 }, { "epoch": 2.1588377723970944, "grad_norm": 4.010861396789551, "learning_rate": 5.428323414360401e-06, "loss": 0.0504, "num_input_tokens_seen": 5548664, "step": 2230 }, { "epoch": 2.1685230024213076, "grad_norm": 2.1378917694091797, "learning_rate": 5.311625878677658e-06, "loss": 0.0398, "num_input_tokens_seen": 5573944, "step": 2240 }, { "epoch": 2.1782082324455208, "grad_norm": 1.6304939985275269, "learning_rate": 5.195925916752166e-06, "loss": 0.045, "num_input_tokens_seen": 5599224, "step": 2250 }, { "epoch": 2.1878934624697335, "grad_norm": 1.6586905717849731, "learning_rate": 5.081235441753685e-06, "loss": 0.0483, "num_input_tokens_seen": 5623864, "step": 2260 }, { "epoch": 2.1975786924939467, "grad_norm": 2.3342106342315674, "learning_rate": 4.9675662629091055e-06, "loss": 0.0476, "num_input_tokens_seen": 5648760, "step": 2270 }, { "epoch": 2.20726392251816, "grad_norm": 1.122441291809082, "learning_rate": 4.854930084286458e-06, "loss": 0.0537, "num_input_tokens_seen": 5673720, "step": 2280 }, { "epoch": 2.216949152542373, "grad_norm": 0.22967131435871124, "learning_rate": 4.743338503589796e-06, "loss": 0.0567, "num_input_tokens_seen": 5697784, "step": 2290 }, { "epoch": 2.226634382566586, "grad_norm": 3.79902720451355, "learning_rate": 4.632803010965056e-06, "loss": 0.0502, "num_input_tokens_seen": 5722040, "step": 2300 }, { "epoch": 2.236319612590799, "grad_norm": 0.5887905359268188, "learning_rate": 4.523334987816917e-06, "loss": 0.0444, "num_input_tokens_seen": 5747672, "step": 2310 }, { "epoch": 2.246004842615012, "grad_norm": 1.776781678199768, "learning_rate": 4.414945705636949e-06, "loss": 0.0482, "num_input_tokens_seen": 5772056, "step": 2320 }, { "epoch": 2.2556900726392253, "grad_norm": 2.457751512527466, "learning_rate": 4.307646324843004e-06, "loss": 0.0398, "num_input_tokens_seen": 5796728, "step": 2330 }, { "epoch": 2.2653753026634385, "grad_norm": 1.8455132246017456, "learning_rate": 4.201447893630065e-06, "loss": 0.0268, "num_input_tokens_seen": 5822520, "step": 2340 }, { "epoch": 2.275060532687651, "grad_norm": 3.7571520805358887, "learning_rate": 4.096361346832681e-06, "loss": 0.0427, "num_input_tokens_seen": 5847768, "step": 2350 }, { "epoch": 2.2847457627118644, "grad_norm": 4.052141189575195, "learning_rate": 3.992397504799039e-06, "loss": 0.0363, "num_input_tokens_seen": 5873208, "step": 2360 }, { "epoch": 2.2944309927360775, "grad_norm": 2.814667224884033, "learning_rate": 3.889567072276827e-06, "loss": 0.0432, "num_input_tokens_seen": 5897368, "step": 2370 }, { "epoch": 2.3041162227602907, "grad_norm": 0.680135190486908, "learning_rate": 3.78788063731103e-06, "loss": 0.0662, "num_input_tokens_seen": 5921656, "step": 2380 }, { "epoch": 2.3138014527845034, "grad_norm": 4.201208591461182, "learning_rate": 3.6873486701536814e-06, "loss": 0.0434, "num_input_tokens_seen": 5946328, "step": 2390 }, { "epoch": 2.3234866828087166, "grad_norm": 1.828552007675171, "learning_rate": 3.587981522185829e-06, "loss": 0.0425, "num_input_tokens_seen": 5971352, "step": 2400 }, { "epoch": 2.33317191283293, "grad_norm": 0.6704538464546204, "learning_rate": 3.4897894248516736e-06, "loss": 0.0533, "num_input_tokens_seen": 5995544, "step": 2410 }, { "epoch": 2.342857142857143, "grad_norm": 2.377774238586426, "learning_rate": 3.3927824886050555e-06, "loss": 0.0499, "num_input_tokens_seen": 6020600, "step": 2420 }, { "epoch": 2.3525423728813557, "grad_norm": 0.2766050398349762, "learning_rate": 3.2969707018684657e-06, "loss": 0.021, "num_input_tokens_seen": 6045304, "step": 2430 }, { "epoch": 2.362227602905569, "grad_norm": 1.9754971265792847, "learning_rate": 3.202363930004536e-06, "loss": 0.0216, "num_input_tokens_seen": 6070776, "step": 2440 }, { "epoch": 2.371912832929782, "grad_norm": 6.165454387664795, "learning_rate": 3.1089719143002615e-06, "loss": 0.0431, "num_input_tokens_seen": 6095256, "step": 2450 }, { "epoch": 2.3815980629539952, "grad_norm": 2.579355001449585, "learning_rate": 3.016804270963994e-06, "loss": 0.0515, "num_input_tokens_seen": 6120088, "step": 2460 }, { "epoch": 2.3912832929782084, "grad_norm": 1.1952487230300903, "learning_rate": 2.925870490135255e-06, "loss": 0.0349, "num_input_tokens_seen": 6144792, "step": 2470 }, { "epoch": 2.400968523002421, "grad_norm": 0.08051615208387375, "learning_rate": 2.8361799349076143e-06, "loss": 0.0251, "num_input_tokens_seen": 6169688, "step": 2480 }, { "epoch": 2.4106537530266343, "grad_norm": 3.1085357666015625, "learning_rate": 2.747741840364593e-06, "loss": 0.0634, "num_input_tokens_seen": 6194680, "step": 2490 }, { "epoch": 2.4203389830508475, "grad_norm": 1.2273328304290771, "learning_rate": 2.6605653126287555e-06, "loss": 0.0451, "num_input_tokens_seen": 6218712, "step": 2500 }, { "epoch": 2.4300242130750607, "grad_norm": 2.9415712356567383, "learning_rate": 2.5746593279241105e-06, "loss": 0.0395, "num_input_tokens_seen": 6243384, "step": 2510 }, { "epoch": 2.4397094430992734, "grad_norm": 0.24813522398471832, "learning_rate": 2.490032731651833e-06, "loss": 0.0537, "num_input_tokens_seen": 6267416, "step": 2520 }, { "epoch": 2.4493946731234866, "grad_norm": 1.5883897542953491, "learning_rate": 2.4066942374795205e-06, "loss": 0.0402, "num_input_tokens_seen": 6292696, "step": 2530 }, { "epoch": 2.4590799031476998, "grad_norm": 0.41333088278770447, "learning_rate": 2.324652426443962e-06, "loss": 0.0295, "num_input_tokens_seen": 6317208, "step": 2540 }, { "epoch": 2.468765133171913, "grad_norm": 3.1688761711120605, "learning_rate": 2.243915746067587e-06, "loss": 0.0515, "num_input_tokens_seen": 6341688, "step": 2550 }, { "epoch": 2.478450363196126, "grad_norm": 0.7070954442024231, "learning_rate": 2.164492509488657e-06, "loss": 0.0443, "num_input_tokens_seen": 6366712, "step": 2560 }, { "epoch": 2.488135593220339, "grad_norm": 0.3987884819507599, "learning_rate": 2.086390894605288e-06, "loss": 0.0555, "num_input_tokens_seen": 6391256, "step": 2570 }, { "epoch": 2.497820823244552, "grad_norm": 1.7903181314468384, "learning_rate": 2.0096189432334194e-06, "loss": 0.054, "num_input_tokens_seen": 6416184, "step": 2580 }, { "epoch": 2.507506053268765, "grad_norm": 7.973659992218018, "learning_rate": 1.9341845602787733e-06, "loss": 0.075, "num_input_tokens_seen": 6441176, "step": 2590 }, { "epoch": 2.5171912832929784, "grad_norm": 2.1646482944488525, "learning_rate": 1.8600955129229009e-06, "loss": 0.0384, "num_input_tokens_seen": 6465688, "step": 2600 }, { "epoch": 2.526876513317191, "grad_norm": 0.9478936791419983, "learning_rate": 1.7873594298234557e-06, "loss": 0.038, "num_input_tokens_seen": 6490456, "step": 2610 }, { "epoch": 2.5365617433414043, "grad_norm": 0.5018621683120728, "learning_rate": 1.7159838003286848e-06, "loss": 0.0233, "num_input_tokens_seen": 6515704, "step": 2620 }, { "epoch": 2.5462469733656174, "grad_norm": 4.254843711853027, "learning_rate": 1.645975973706269e-06, "loss": 0.0634, "num_input_tokens_seen": 6540920, "step": 2630 }, { "epoch": 2.5559322033898306, "grad_norm": 0.3339782655239105, "learning_rate": 1.5773431583866227e-06, "loss": 0.0333, "num_input_tokens_seen": 6565880, "step": 2640 }, { "epoch": 2.565617433414044, "grad_norm": 2.9373421669006348, "learning_rate": 1.5100924212206534e-06, "loss": 0.0649, "num_input_tokens_seen": 6591000, "step": 2650 }, { "epoch": 2.5753026634382565, "grad_norm": 1.637086033821106, "learning_rate": 1.44423068675212e-06, "loss": 0.0531, "num_input_tokens_seen": 6615800, "step": 2660 }, { "epoch": 2.5849878934624697, "grad_norm": 0.06637797504663467, "learning_rate": 1.3797647365046411e-06, "loss": 0.0426, "num_input_tokens_seen": 6639288, "step": 2670 }, { "epoch": 2.594673123486683, "grad_norm": 0.9268229603767395, "learning_rate": 1.3167012082834212e-06, "loss": 0.0368, "num_input_tokens_seen": 6664632, "step": 2680 }, { "epoch": 2.6043583535108956, "grad_norm": 4.011239528656006, "learning_rate": 1.2550465954917932e-06, "loss": 0.0165, "num_input_tokens_seen": 6689496, "step": 2690 }, { "epoch": 2.614043583535109, "grad_norm": 3.382112741470337, "learning_rate": 1.1948072464626102e-06, "loss": 0.0331, "num_input_tokens_seen": 6714552, "step": 2700 }, { "epoch": 2.623728813559322, "grad_norm": 5.245890140533447, "learning_rate": 1.1359893638045854e-06, "loss": 0.0226, "num_input_tokens_seen": 6739320, "step": 2710 }, { "epoch": 2.633414043583535, "grad_norm": 2.0806005001068115, "learning_rate": 1.0785990037636335e-06, "loss": 0.0611, "num_input_tokens_seen": 6763352, "step": 2720 }, { "epoch": 2.6430992736077483, "grad_norm": 2.040339469909668, "learning_rate": 1.022642075599286e-06, "loss": 0.0615, "num_input_tokens_seen": 6787544, "step": 2730 }, { "epoch": 2.6527845036319615, "grad_norm": 4.939095973968506, "learning_rate": 9.68124340976232e-07, "loss": 0.0393, "num_input_tokens_seen": 6812760, "step": 2740 }, { "epoch": 2.6624697336561742, "grad_norm": 0.7793028354644775, "learning_rate": 9.150514133710647e-07, "loss": 0.0656, "num_input_tokens_seen": 6838008, "step": 2750 }, { "epoch": 2.6721549636803874, "grad_norm": 0.568551778793335, "learning_rate": 8.634287574942834e-07, "loss": 0.0452, "num_input_tokens_seen": 6863320, "step": 2760 }, { "epoch": 2.6818401937046006, "grad_norm": 5.33021354675293, "learning_rate": 8.132616887276212e-07, "loss": 0.0404, "num_input_tokens_seen": 6888824, "step": 2770 }, { "epoch": 2.6915254237288133, "grad_norm": 4.118853569030762, "learning_rate": 7.645553725767229e-07, "loss": 0.0543, "num_input_tokens_seen": 6913048, "step": 2780 }, { "epoch": 2.7012106537530265, "grad_norm": 1.218005895614624, "learning_rate": 7.173148241392957e-07, "loss": 0.0459, "num_input_tokens_seen": 6937432, "step": 2790 }, { "epoch": 2.7108958837772397, "grad_norm": 0.6871452927589417, "learning_rate": 6.71544907588712e-07, "loss": 0.0386, "num_input_tokens_seen": 6962584, "step": 2800 }, { "epoch": 2.720581113801453, "grad_norm": 2.3115310668945312, "learning_rate": 6.272503356731601e-07, "loss": 0.0714, "num_input_tokens_seen": 6987768, "step": 2810 }, { "epoch": 2.730266343825666, "grad_norm": 4.2863569259643555, "learning_rate": 5.84435669230401e-07, "loss": 0.0364, "num_input_tokens_seen": 7013336, "step": 2820 }, { "epoch": 2.739951573849879, "grad_norm": 0.879754900932312, "learning_rate": 5.431053167181515e-07, "loss": 0.0346, "num_input_tokens_seen": 7038648, "step": 2830 }, { "epoch": 2.749636803874092, "grad_norm": 1.9641544818878174, "learning_rate": 5.032635337601687e-07, "loss": 0.0337, "num_input_tokens_seen": 7064184, "step": 2840 }, { "epoch": 2.759322033898305, "grad_norm": 0.6523151993751526, "learning_rate": 4.6491442270805596e-07, "loss": 0.0229, "num_input_tokens_seen": 7089336, "step": 2850 }, { "epoch": 2.7690072639225183, "grad_norm": 0.46984636783599854, "learning_rate": 4.280619322188628e-07, "loss": 0.0472, "num_input_tokens_seen": 7114072, "step": 2860 }, { "epoch": 2.778692493946731, "grad_norm": 2.178297519683838, "learning_rate": 3.9270985684851545e-07, "loss": 0.0498, "num_input_tokens_seen": 7139576, "step": 2870 }, { "epoch": 2.788377723970944, "grad_norm": 3.751574993133545, "learning_rate": 3.588618366610941e-07, "loss": 0.0442, "num_input_tokens_seen": 7165432, "step": 2880 }, { "epoch": 2.7980629539951574, "grad_norm": 1.0459034442901611, "learning_rate": 3.2652135685403593e-07, "loss": 0.0324, "num_input_tokens_seen": 7190808, "step": 2890 }, { "epoch": 2.8077481840193705, "grad_norm": 3.6684751510620117, "learning_rate": 2.9569174739928096e-07, "loss": 0.0497, "num_input_tokens_seen": 7216440, "step": 2900 }, { "epoch": 2.8174334140435837, "grad_norm": 4.388014316558838, "learning_rate": 2.663761827003941e-07, "loss": 0.0404, "num_input_tokens_seen": 7243480, "step": 2910 }, { "epoch": 2.8271186440677964, "grad_norm": 6.251937389373779, "learning_rate": 2.38577681265707e-07, "loss": 0.0479, "num_input_tokens_seen": 7268568, "step": 2920 }, { "epoch": 2.8368038740920096, "grad_norm": 2.676504611968994, "learning_rate": 2.122991053975215e-07, "loss": 0.0378, "num_input_tokens_seen": 7293784, "step": 2930 }, { "epoch": 2.846489104116223, "grad_norm": 4.877316474914551, "learning_rate": 1.8754316089737878e-07, "loss": 0.0328, "num_input_tokens_seen": 7318680, "step": 2940 }, { "epoch": 2.856174334140436, "grad_norm": 1.454691767692566, "learning_rate": 1.6431239678746546e-07, "loss": 0.0411, "num_input_tokens_seen": 7343864, "step": 2950 }, { "epoch": 2.8658595641646487, "grad_norm": 3.7415764331817627, "learning_rate": 1.4260920504814366e-07, "loss": 0.0649, "num_input_tokens_seen": 7370232, "step": 2960 }, { "epoch": 2.875544794188862, "grad_norm": 2.577986240386963, "learning_rate": 1.22435820371658e-07, "loss": 0.0462, "num_input_tokens_seen": 7394936, "step": 2970 }, { "epoch": 2.885230024213075, "grad_norm": 4.861838340759277, "learning_rate": 1.0379431993204458e-07, "loss": 0.0425, "num_input_tokens_seen": 7420088, "step": 2980 }, { "epoch": 2.8949152542372882, "grad_norm": 3.2706315517425537, "learning_rate": 8.668662317124043e-08, "loss": 0.0418, "num_input_tokens_seen": 7445048, "step": 2990 }, { "epoch": 2.9046004842615014, "grad_norm": 0.6351612210273743, "learning_rate": 7.111449160146333e-08, "loss": 0.022, "num_input_tokens_seen": 7469144, "step": 3000 }, { "epoch": 2.914285714285714, "grad_norm": 2.6043741703033447, "learning_rate": 5.7079528623816824e-08, "loss": 0.0529, "num_input_tokens_seen": 7493528, "step": 3010 }, { "epoch": 2.9239709443099273, "grad_norm": 0.3836284875869751, "learning_rate": 4.4583179363210656e-08, "loss": 0.0335, "num_input_tokens_seen": 7517560, "step": 3020 }, { "epoch": 2.9336561743341405, "grad_norm": 0.7341143488883972, "learning_rate": 3.3626730519551455e-08, "loss": 0.0338, "num_input_tokens_seen": 7542552, "step": 3030 }, { "epoch": 2.9433414043583537, "grad_norm": 2.3991236686706543, "learning_rate": 2.4211310235258687e-08, "loss": 0.0403, "num_input_tokens_seen": 7566968, "step": 3040 }, { "epoch": 2.9530266343825664, "grad_norm": 1.5679802894592285, "learning_rate": 1.633788797910929e-08, "loss": 0.0259, "num_input_tokens_seen": 7591672, "step": 3050 }, { "epoch": 2.9627118644067796, "grad_norm": 0.794366717338562, "learning_rate": 1.0007274446409143e-08, "loss": 0.0392, "num_input_tokens_seen": 7616536, "step": 3060 }, { "epoch": 2.9723970944309928, "grad_norm": 1.9847420454025269, "learning_rate": 5.220121475519868e-09, "loss": 0.0487, "num_input_tokens_seen": 7640824, "step": 3070 }, { "epoch": 2.982082324455206, "grad_norm": 3.2490248680114746, "learning_rate": 1.976921980745838e-09, "loss": 0.056, "num_input_tokens_seen": 7666328, "step": 3080 }, { "epoch": 2.991767554479419, "grad_norm": 0.0969802513718605, "learning_rate": 2.780099015747828e-10, "loss": 0.0201, "num_input_tokens_seen": 7691224, "step": 3090 }, { "epoch": 2.9975786924939465, "num_input_tokens_seen": 7706072, "step": 3096, "total_flos": 3.298866475009966e+17, "train_loss": 0.08281170262038245, "train_runtime": 2763.8125, "train_samples_per_second": 8.963, "train_steps_per_second": 1.12 } ], "logging_steps": 10, "max_steps": 3096, "num_input_tokens_seen": 7706072, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.298866475009966e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }