{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100.0,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025,
      "grad_norm": 0.09379793703556061,
      "learning_rate": 5.999999999999999e-06,
      "loss": 0.6799,
      "step": 5
    },
    {
      "epoch": 0.005,
      "grad_norm": 0.1399833709001541,
      "learning_rate": 1.3499999999999998e-05,
      "loss": 0.6954,
      "step": 10
    },
    {
      "epoch": 0.0075,
      "grad_norm": 0.08632303029298782,
      "learning_rate": 2.1e-05,
      "loss": 0.6921,
      "step": 15
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.10006701201200485,
      "learning_rate": 2.8499999999999998e-05,
      "loss": 0.69,
      "step": 20
    },
    {
      "epoch": 0.0125,
      "grad_norm": 0.07633858919143677,
      "learning_rate": 3.5999999999999994e-05,
      "loss": 0.6722,
      "step": 25
    },
    {
      "epoch": 0.015,
      "grad_norm": 0.09399061650037766,
      "learning_rate": 4.3499999999999993e-05,
      "loss": 0.6453,
      "step": 30
    },
    {
      "epoch": 0.0175,
      "grad_norm": 0.0843738541007042,
      "learning_rate": 5.1e-05,
      "loss": 0.6276,
      "step": 35
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.08583351224660873,
      "learning_rate": 5.85e-05,
      "loss": 0.58,
      "step": 40
    },
    {
      "epoch": 0.0225,
      "grad_norm": 0.09571370482444763,
      "learning_rate": 6.599999999999999e-05,
      "loss": 0.6355,
      "step": 45
    },
    {
      "epoch": 0.025,
      "grad_norm": 0.1083935871720314,
      "learning_rate": 7.35e-05,
      "loss": 0.589,
      "step": 50
    },
    {
      "epoch": 0.0275,
      "grad_norm": 0.10387319326400757,
      "learning_rate": 8.1e-05,
      "loss": 0.6061,
      "step": 55
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.11083361506462097,
      "learning_rate": 8.849999999999998e-05,
      "loss": 0.572,
      "step": 60
    },
    {
      "epoch": 0.0325,
      "grad_norm": 0.12665686011314392,
      "learning_rate": 9.599999999999999e-05,
      "loss": 0.5442,
      "step": 65
    },
    {
      "epoch": 0.035,
      "grad_norm": 0.1308053582906723,
      "learning_rate": 0.00010349999999999998,
      "loss": 0.6524,
      "step": 70
    },
    {
      "epoch": 0.0375,
      "grad_norm": 0.13535510003566742,
      "learning_rate": 0.00011099999999999999,
      "loss": 0.6404,
      "step": 75
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.12833671271800995,
      "learning_rate": 0.0001185,
      "loss": 0.5717,
      "step": 80
    },
    {
      "epoch": 0.0425,
      "grad_norm": 0.11962099373340607,
      "learning_rate": 0.00012599999999999997,
      "loss": 0.6098,
      "step": 85
    },
    {
      "epoch": 0.045,
      "grad_norm": 0.13898271322250366,
      "learning_rate": 0.0001335,
      "loss": 0.6099,
      "step": 90
    },
    {
      "epoch": 0.0475,
      "grad_norm": 0.14486610889434814,
      "learning_rate": 0.00014099999999999998,
      "loss": 0.5744,
      "step": 95
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.1432138830423355,
      "learning_rate": 0.00014849999999999998,
      "loss": 0.5659,
      "step": 100
    },
    {
      "epoch": 0.0525,
      "grad_norm": 0.13487878441810608,
      "learning_rate": 0.000156,
      "loss": 0.5622,
      "step": 105
    },
    {
      "epoch": 0.055,
      "grad_norm": 0.12495309859514236,
      "learning_rate": 0.0001635,
      "loss": 0.5951,
      "step": 110
    },
    {
      "epoch": 0.0575,
      "grad_norm": 0.13011734187602997,
      "learning_rate": 0.00017099999999999998,
      "loss": 0.6249,
      "step": 115
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.13987745344638824,
      "learning_rate": 0.00017849999999999997,
      "loss": 0.559,
      "step": 120
    },
    {
      "epoch": 0.0625,
      "grad_norm": 0.13373605906963348,
      "learning_rate": 0.000186,
      "loss": 0.5475,
      "step": 125
    },
    {
      "epoch": 0.065,
      "grad_norm": 0.12433867901563644,
      "learning_rate": 0.0001935,
      "loss": 0.5274,
      "step": 130
    },
    {
      "epoch": 0.0675,
      "grad_norm": 0.11097615957260132,
      "learning_rate": 0.000201,
      "loss": 0.678,
      "step": 135
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.1155027225613594,
      "learning_rate": 0.00020849999999999997,
      "loss": 0.5611,
      "step": 140
    },
    {
      "epoch": 0.0725,
      "grad_norm": 0.11431068181991577,
      "learning_rate": 0.00021599999999999996,
      "loss": 0.6054,
      "step": 145
    },
    {
      "epoch": 0.075,
      "grad_norm": 0.09796140342950821,
      "learning_rate": 0.00022349999999999998,
      "loss": 0.5472,
      "step": 150
    },
    {
      "epoch": 0.0775,
      "grad_norm": 0.09489257633686066,
      "learning_rate": 0.00023099999999999998,
      "loss": 0.4636,
      "step": 155
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.10787788033485413,
      "learning_rate": 0.0002385,
      "loss": 0.6164,
      "step": 160
    },
    {
      "epoch": 0.0825,
      "grad_norm": 0.10261733084917068,
      "learning_rate": 0.00024599999999999996,
      "loss": 0.5408,
      "step": 165
    },
    {
      "epoch": 0.085,
      "grad_norm": 0.11870352178812027,
      "learning_rate": 0.0002535,
      "loss": 0.5268,
      "step": 170
    },
    {
      "epoch": 0.0875,
      "grad_norm": 0.11910569667816162,
      "learning_rate": 0.000261,
      "loss": 0.5461,
      "step": 175
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.10083702206611633,
      "learning_rate": 0.00026849999999999997,
      "loss": 0.4794,
      "step": 180
    },
    {
      "epoch": 0.0925,
      "grad_norm": 0.10453511029481888,
      "learning_rate": 0.000276,
      "loss": 0.5539,
      "step": 185
    },
    {
      "epoch": 0.095,
      "grad_norm": 0.101403146982193,
      "learning_rate": 0.00028349999999999995,
      "loss": 0.5346,
      "step": 190
    },
    {
      "epoch": 0.0975,
      "grad_norm": 0.10724789649248123,
      "learning_rate": 0.00029099999999999997,
      "loss": 0.6026,
      "step": 195
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.1140277311205864,
      "learning_rate": 0.0002985,
      "loss": 0.5193,
      "step": 200
    },
    {
      "epoch": 0.1025,
      "grad_norm": 0.09706108272075653,
      "learning_rate": 0.0002999963446058092,
      "loss": 0.54,
      "step": 205
    },
    {
      "epoch": 0.105,
      "grad_norm": 0.10003062337636948,
      "learning_rate": 0.0002999814948722491,
      "loss": 0.5365,
      "step": 210
    },
    {
      "epoch": 0.1075,
      "grad_norm": 0.1078687533736229,
      "learning_rate": 0.00029995522346717746,
      "loss": 0.5889,
      "step": 215
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.10538115352392197,
      "learning_rate": 0.0002999175323912636,
      "loss": 0.5611,
      "step": 220
    },
    {
      "epoch": 0.1125,
      "grad_norm": 0.1020808294415474,
      "learning_rate": 0.00029986842451482874,
      "loss": 0.6103,
      "step": 225
    },
    {
      "epoch": 0.115,
      "grad_norm": 0.09635835886001587,
      "learning_rate": 0.0002998079035776279,
      "loss": 0.5229,
      "step": 230
    },
    {
      "epoch": 0.1175,
      "grad_norm": 0.10287190228700638,
      "learning_rate": 0.0002997359741885648,
      "loss": 0.5312,
      "step": 235
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.09160075336694717,
      "learning_rate": 0.0002996526418253408,
      "loss": 0.5673,
      "step": 240
    },
    {
      "epoch": 0.1225,
      "grad_norm": 0.08691006153821945,
      "learning_rate": 0.000299557912834038,
      "loss": 0.5326,
      "step": 245
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.10096988826990128,
      "learning_rate": 0.00029945179442863594,
      "loss": 0.6004,
      "step": 250
    },
    {
      "epoch": 0.1275,
      "grad_norm": 0.09594204276800156,
      "learning_rate": 0.000299334294690462,
      "loss": 0.5516,
      "step": 255
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.10281919687986374,
      "learning_rate": 0.00029920542256757607,
      "loss": 0.5515,
      "step": 260
    },
    {
      "epoch": 0.1325,
      "grad_norm": 0.08547840267419815,
      "learning_rate": 0.00029906518787408944,
      "loss": 0.5243,
      "step": 265
    },
    {
      "epoch": 0.135,
      "grad_norm": 0.10161560773849487,
      "learning_rate": 0.0002989136012894168,
      "loss": 0.5096,
      "step": 270
    },
    {
      "epoch": 0.1375,
      "grad_norm": 0.09101904183626175,
      "learning_rate": 0.0002987506743574635,
      "loss": 0.553,
      "step": 275
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.09769442677497864,
      "learning_rate": 0.0002985764194857463,
      "loss": 0.4953,
      "step": 280
    },
    {
      "epoch": 0.1425,
      "grad_norm": 0.10991579294204712,
      "learning_rate": 0.00029839084994444826,
      "loss": 0.5152,
      "step": 285
    },
    {
      "epoch": 0.145,
      "grad_norm": 0.09450916200876236,
      "learning_rate": 0.00029819397986540836,
      "loss": 0.5397,
      "step": 290
    },
    {
      "epoch": 0.1475,
      "grad_norm": 0.10876069217920303,
      "learning_rate": 0.0002979858242410454,
      "loss": 0.4858,
      "step": 295
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.097995825111866,
      "learning_rate": 0.00029776639892321606,
      "loss": 0.5566,
      "step": 300
    },
    {
      "epoch": 0.1525,
      "grad_norm": 0.1145048514008522,
      "learning_rate": 0.0002975357206220079,
      "loss": 0.4531,
      "step": 305
    },
    {
      "epoch": 0.155,
      "grad_norm": 0.10271880775690079,
      "learning_rate": 0.00029729380690446654,
      "loss": 0.5199,
      "step": 310
    },
    {
      "epoch": 0.1575,
      "grad_norm": 0.11095371842384338,
      "learning_rate": 0.0002970406761932583,
      "loss": 0.5416,
      "step": 315
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.09949438273906708,
      "learning_rate": 0.00029677634776526673,
      "loss": 0.4841,
      "step": 320
    },
    {
      "epoch": 0.1625,
      "grad_norm": 0.1163724958896637,
      "learning_rate": 0.00029650084175012517,
      "loss": 0.4913,
      "step": 325
    },
    {
      "epoch": 0.165,
      "grad_norm": 0.10726840049028397,
      "learning_rate": 0.00029621417912868323,
      "loss": 0.5203,
      "step": 330
    },
    {
      "epoch": 0.1675,
      "grad_norm": 0.09609931707382202,
      "learning_rate": 0.00029591638173140947,
      "loss": 0.5607,
      "step": 335
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.10824442654848099,
      "learning_rate": 0.0002956074722367286,
      "loss": 0.6004,
      "step": 340
    },
    {
      "epoch": 0.1725,
      "grad_norm": 0.10465679317712784,
      "learning_rate": 0.00029528747416929463,
      "loss": 0.5216,
      "step": 345
    },
    {
      "epoch": 0.175,
      "grad_norm": 0.10518354922533035,
      "learning_rate": 0.0002949564118981994,
      "loss": 0.499,
      "step": 350
    },
    {
      "epoch": 0.1775,
      "grad_norm": 0.0955279991030693,
      "learning_rate": 0.0002946143106351165,
      "loss": 0.5607,
      "step": 355
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.11159654706716537,
      "learning_rate": 0.0002942611964323817,
      "loss": 0.5204,
      "step": 360
    },
    {
      "epoch": 0.1825,
      "grad_norm": 0.09571187198162079,
      "learning_rate": 0.0002938970961810086,
      "loss": 0.6113,
      "step": 365
    },
    {
      "epoch": 0.185,
      "grad_norm": 0.11854679882526398,
      "learning_rate": 0.0002935220376086411,
      "loss": 0.5639,
      "step": 370
    },
    {
      "epoch": 0.1875,
      "grad_norm": 0.1050512045621872,
      "learning_rate": 0.0002931360492774415,
      "loss": 0.548,
      "step": 375
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.1053968220949173,
      "learning_rate": 0.0002927391605819157,
      "loss": 0.5507,
      "step": 380
    },
    {
      "epoch": 0.1925,
      "grad_norm": 0.10567320138216019,
      "learning_rate": 0.00029233140174667445,
      "loss": 0.5312,
      "step": 385
    },
    {
      "epoch": 0.195,
      "grad_norm": 0.11914283782243729,
      "learning_rate": 0.0002919128038241318,
      "loss": 0.5961,
      "step": 390
    },
    {
      "epoch": 0.1975,
      "grad_norm": 0.09915795922279358,
      "learning_rate": 0.0002914833986921401,
      "loss": 0.5086,
      "step": 395
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.10796502232551575,
      "learning_rate": 0.0002910432190515628,
      "loss": 0.5585,
      "step": 400
    },
    {
      "epoch": 0.2025,
      "grad_norm": 0.10748997330665588,
      "learning_rate": 0.00029059229842378373,
      "loss": 0.5466,
      "step": 405
    },
    {
      "epoch": 0.205,
      "grad_norm": 0.10696308314800262,
      "learning_rate": 0.0002901306711481544,
      "loss": 0.5513,
      "step": 410
    },
    {
      "epoch": 0.2075,
      "grad_norm": 0.10418657958507538,
      "learning_rate": 0.0002896583723793792,
      "loss": 0.5391,
      "step": 415
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.16421550512313843,
      "learning_rate": 0.00028917543808483796,
      "loss": 0.4699,
      "step": 420
    },
    {
      "epoch": 0.2125,
      "grad_norm": 0.12929962575435638,
      "learning_rate": 0.00028868190504184696,
      "loss": 0.4984,
      "step": 425
    },
    {
      "epoch": 0.215,
      "grad_norm": 0.10469454526901245,
      "learning_rate": 0.00028817781083485816,
      "loss": 0.5119,
      "step": 430
    },
    {
      "epoch": 0.2175,
      "grad_norm": 0.0964970663189888,
      "learning_rate": 0.00028766319385259713,
      "loss": 0.5167,
      "step": 435
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.12395574152469635,
      "learning_rate": 0.00028713809328513953,
      "loss": 0.5692,
      "step": 440
    },
    {
      "epoch": 0.2225,
      "grad_norm": 0.10189738124608994,
      "learning_rate": 0.0002866025491209265,
      "loss": 0.4628,
      "step": 445
    },
    {
      "epoch": 0.225,
      "grad_norm": 0.10433454066514969,
      "learning_rate": 0.0002860566021437197,
      "loss": 0.4869,
      "step": 450
    },
    {
      "epoch": 0.2275,
      "grad_norm": 0.13003456592559814,
      "learning_rate": 0.0002855002939294951,
      "loss": 0.5291,
      "step": 455
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.11692202836275101,
      "learning_rate": 0.000284933666843277,
      "loss": 0.5229,
      "step": 460
    },
    {
      "epoch": 0.2325,
      "grad_norm": 0.10757846385240555,
      "learning_rate": 0.0002843567640359119,
      "loss": 0.435,
      "step": 465
    },
    {
      "epoch": 0.235,
      "grad_norm": 0.10775501281023026,
      "learning_rate": 0.00028376962944078206,
      "loss": 0.4418,
      "step": 470
    },
    {
      "epoch": 0.2375,
      "grad_norm": 0.11543692648410797,
      "learning_rate": 0.00028317230777046015,
      "loss": 0.4204,
      "step": 475
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.10946698486804962,
      "learning_rate": 0.00028256484451330403,
      "loss": 0.49,
      "step": 480
    },
    {
      "epoch": 0.2425,
      "grad_norm": 0.11528221517801285,
      "learning_rate": 0.00028194728592999247,
      "loss": 0.4752,
      "step": 485
    },
    {
      "epoch": 0.245,
      "grad_norm": 0.10474205762147903,
      "learning_rate": 0.0002813196790500027,
      "loss": 0.4847,
      "step": 490
    },
    {
      "epoch": 0.2475,
      "grad_norm": 0.10768820345401764,
      "learning_rate": 0.00028068207166802837,
      "loss": 0.4664,
      "step": 495
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.12158560007810593,
      "learning_rate": 0.00028003451234034037,
      "loss": 0.4741,
      "step": 500
    },
    {
      "epoch": 0.2525,
      "grad_norm": 0.11635497957468033,
      "learning_rate": 0.0002793770503810886,
      "loss": 0.4969,
      "step": 505
    },
    {
      "epoch": 0.255,
      "grad_norm": 0.12205849587917328,
      "learning_rate": 0.00027870973585854665,
      "loss": 0.4798,
      "step": 510
    },
    {
      "epoch": 0.2575,
      "grad_norm": 0.10270871222019196,
      "learning_rate": 0.00027803261959129905,
      "loss": 0.3888,
      "step": 515
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.11313367635011673,
      "learning_rate": 0.0002773457531443712,
      "loss": 0.4759,
      "step": 520
    },
    {
      "epoch": 0.2625,
      "grad_norm": 0.12905193865299225,
      "learning_rate": 0.00027664918882530225,
      "loss": 0.4442,
      "step": 525
    },
    {
      "epoch": 0.265,
      "grad_norm": 0.11690939962863922,
      "learning_rate": 0.00027594297968016197,
      "loss": 0.5535,
      "step": 530
    },
    {
      "epoch": 0.2675,
      "grad_norm": 0.10021405667066574,
      "learning_rate": 0.00027522717948951094,
      "loss": 0.4717,
      "step": 535
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.10104178637266159,
      "learning_rate": 0.0002745018427643051,
      "loss": 0.4906,
      "step": 540
    },
    {
      "epoch": 0.2725,
      "grad_norm": 0.12113891541957855,
      "learning_rate": 0.00027376702474174425,
      "loss": 0.5674,
      "step": 545
    },
    {
      "epoch": 0.275,
      "grad_norm": 0.11330476403236389,
      "learning_rate": 0.0002730227813810658,
      "loss": 0.5184,
      "step": 550
    },
    {
      "epoch": 0.2775,
      "grad_norm": 0.1025850847363472,
      "learning_rate": 0.0002722691693592831,
      "loss": 0.4395,
      "step": 555
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.11591499298810959,
      "learning_rate": 0.0002715062460668694,
      "loss": 0.5003,
      "step": 560
    },
    {
      "epoch": 0.2825,
      "grad_norm": 0.11281153559684753,
      "learning_rate": 0.0002707340696033871,
      "loss": 0.4672,
      "step": 565
    },
    {
      "epoch": 0.285,
      "grad_norm": 0.1123538464307785,
      "learning_rate": 0.00026995269877306356,
      "loss": 0.513,
      "step": 570
    },
    {
      "epoch": 0.2875,
      "grad_norm": 0.10776390135288239,
      "learning_rate": 0.0002691621930803127,
      "loss": 0.4572,
      "step": 575
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.10008667409420013,
      "learning_rate": 0.0002683626127252036,
      "loss": 0.4618,
      "step": 580
    },
    {
      "epoch": 0.2925,
      "grad_norm": 0.13961340487003326,
      "learning_rate": 0.00026755401859887595,
      "loss": 0.4819,
      "step": 585
    },
    {
      "epoch": 0.295,
      "grad_norm": 0.1476685106754303,
      "learning_rate": 0.00026673647227890316,
      "loss": 0.4964,
      "step": 590
    },
    {
      "epoch": 0.2975,
      "grad_norm": 0.09795507788658142,
      "learning_rate": 0.00026591003602460263,
      "loss": 0.4796,
      "step": 595
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.10903532058000565,
      "learning_rate": 0.00026507477277229496,
      "loss": 0.4775,
      "step": 600
    },
    {
      "epoch": 0.3025,
      "grad_norm": 0.10258448123931885,
      "learning_rate": 0.0002642307461305105,
      "loss": 0.4519,
      "step": 605
    },
    {
      "epoch": 0.305,
      "grad_norm": 0.11204435676336288,
      "learning_rate": 0.0002633780203751459,
      "loss": 0.4451,
      "step": 610
    },
    {
      "epoch": 0.3075,
      "grad_norm": 0.10147629678249359,
      "learning_rate": 0.0002625166604445689,
      "loss": 0.4256,
      "step": 615
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.10481107234954834,
      "learning_rate": 0.00026164673193467306,
      "loss": 0.4381,
      "step": 620
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.10856641829013824,
      "learning_rate": 0.00026076830109388255,
      "loss": 0.4958,
      "step": 625
    },
    {
      "epoch": 0.315,
      "grad_norm": 0.09918677806854248,
      "learning_rate": 0.0002598814348181068,
      "loss": 0.4335,
      "step": 630
    },
    {
      "epoch": 0.3175,
      "grad_norm": 0.10417389869689941,
      "learning_rate": 0.00025898620064564637,
      "loss": 0.4603,
      "step": 635
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.0903329998254776,
      "learning_rate": 0.00025808266675204954,
      "loss": 0.3932,
      "step": 640
    },
    {
      "epoch": 0.3225,
      "grad_norm": 0.11511855572462082,
      "learning_rate": 0.0002571709019449205,
      "loss": 0.4169,
      "step": 645
    },
    {
      "epoch": 0.325,
      "grad_norm": 0.11355557292699814,
      "learning_rate": 0.0002562509756586793,
      "loss": 0.4455,
      "step": 650
    },
    {
      "epoch": 0.3275,
      "grad_norm": 0.1271187961101532,
      "learning_rate": 0.00025532295794927437,
      "loss": 0.4902,
      "step": 655
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.11936645954847336,
      "learning_rate": 0.0002543869194888471,
      "loss": 0.4843,
      "step": 660
    },
    {
      "epoch": 0.3325,
      "grad_norm": 0.11935465037822723,
      "learning_rate": 0.00025344293156035044,
      "loss": 0.4402,
      "step": 665
    },
    {
      "epoch": 0.335,
      "grad_norm": 0.13073407113552094,
      "learning_rate": 0.00025249106605211986,
      "loss": 0.467,
      "step": 670
    },
    {
      "epoch": 0.3375,
      "grad_norm": 0.10340435802936554,
      "learning_rate": 0.0002515313954523991,
      "loss": 0.4827,
      "step": 675
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.11634550243616104,
      "learning_rate": 0.00025056399284381983,
      "loss": 0.466,
      "step": 680
    },
    {
      "epoch": 0.3425,
      "grad_norm": 0.10582319647073746,
      "learning_rate": 0.0002495889318978362,
      "loss": 0.4751,
      "step": 685
    },
    {
      "epoch": 0.345,
      "grad_norm": 0.16781780123710632,
      "learning_rate": 0.00024860628686911436,
      "loss": 0.4717,
      "step": 690
    },
    {
      "epoch": 0.3475,
      "grad_norm": 0.11522196233272552,
      "learning_rate": 0.0002476161325898776,
      "loss": 0.4687,
      "step": 695
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.11830449104309082,
      "learning_rate": 0.000246618544464208,
      "loss": 0.436,
      "step": 700
    },
    {
      "epoch": 0.3525,
      "grad_norm": 0.17485427856445312,
      "learning_rate": 0.0002456135984623034,
      "loss": 0.4284,
      "step": 705
    },
    {
      "epoch": 0.355,
      "grad_norm": 0.12288108468055725,
      "learning_rate": 0.00024460137111469296,
      "loss": 0.4261,
      "step": 710
    },
    {
      "epoch": 0.3575,
      "grad_norm": 0.11587081104516983,
      "learning_rate": 0.0002435819395064079,
      "loss": 0.4493,
      "step": 715
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.10690271109342575,
      "learning_rate": 0.0002425553812711123,
      "loss": 0.4648,
      "step": 720
    },
    {
      "epoch": 0.3625,
      "grad_norm": 0.10404397547245026,
      "learning_rate": 0.00024152177458519014,
      "loss": 0.4634,
      "step": 725
    },
    {
      "epoch": 0.365,
      "grad_norm": 0.11986954510211945,
      "learning_rate": 0.00024048119816179236,
      "loss": 0.4525,
      "step": 730
    },
    {
      "epoch": 0.3675,
      "grad_norm": 0.10243026167154312,
      "learning_rate": 0.00023943373124484234,
      "loss": 0.4572,
      "step": 735
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.10386748611927032,
      "learning_rate": 0.00023837945360300129,
      "loss": 0.3884,
      "step": 740
    },
    {
      "epoch": 0.3725,
      "grad_norm": 0.11165735125541687,
      "learning_rate": 0.0002373184455235934,
      "loss": 0.4902,
      "step": 745
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.09951601922512054,
      "learning_rate": 0.00023625078780649178,
      "loss": 0.4541,
      "step": 750
    },
    {
      "epoch": 0.3775,
      "grad_norm": 0.10347504913806915,
      "learning_rate": 0.00023517656175796518,
      "loss": 0.3871,
      "step": 755
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.10478132963180542,
      "learning_rate": 0.00023409584918448627,
      "loss": 0.4329,
      "step": 760
    },
    {
      "epoch": 0.3825,
      "grad_norm": 0.1198212131857872,
      "learning_rate": 0.00023300873238650159,
      "loss": 0.425,
      "step": 765
    },
    {
      "epoch": 0.385,
      "grad_norm": 0.1103711724281311,
      "learning_rate": 0.00023191529415216434,
      "loss": 0.4274,
      "step": 770
    },
    {
      "epoch": 0.3875,
      "grad_norm": 0.09940385073423386,
      "learning_rate": 0.00023081561775102944,
      "loss": 0.4368,
      "step": 775
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.11599268019199371,
      "learning_rate": 0.00022970978692771242,
      "loss": 0.4386,
      "step": 780
    },
    {
      "epoch": 0.3925,
      "grad_norm": 0.10101296752691269,
      "learning_rate": 0.00022859788589551188,
      "loss": 0.4696,
      "step": 785
    },
    {
      "epoch": 0.395,
      "grad_norm": 0.10112808644771576,
      "learning_rate": 0.00022747999932999624,
      "loss": 0.4066,
      "step": 790
    },
    {
      "epoch": 0.3975,
      "grad_norm": 0.09595459699630737,
      "learning_rate": 0.00022635621236255567,
      "loss": 0.4837,
      "step": 795
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.10761380940675735,
      "learning_rate": 0.00022522661057391857,
      "loss": 0.5446,
      "step": 800
    },
    {
      "epoch": 0.4025,
      "grad_norm": 0.11919954419136047,
      "learning_rate": 0.00022409127998763463,
      "loss": 0.5027,
      "step": 805
    },
    {
      "epoch": 0.405,
      "grad_norm": 0.10851597785949707,
      "learning_rate": 0.00022295030706352356,
      "loss": 0.4481,
      "step": 810
    },
    {
      "epoch": 0.4075,
      "grad_norm": 0.10030311346054077,
      "learning_rate": 0.00022180377869109104,
      "loss": 0.4709,
      "step": 815
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.111280657351017,
      "learning_rate": 0.00022065178218291147,
      "loss": 0.4423,
      "step": 820
    },
    {
      "epoch": 0.4125,
      "grad_norm": 0.11253602802753448,
      "learning_rate": 0.00021949440526797926,
      "loss": 0.4136,
      "step": 825
    },
    {
      "epoch": 0.415,
      "grad_norm": 0.10805424302816391,
      "learning_rate": 0.00021833173608502732,
      "loss": 0.4656,
      "step": 830
    },
    {
      "epoch": 0.4175,
      "grad_norm": 0.10983198881149292,
      "learning_rate": 0.00021716386317581542,
      "loss": 0.3687,
      "step": 835
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.10653118044137955,
      "learning_rate": 0.00021599087547838727,
      "loss": 0.4654,
      "step": 840
    },
    {
      "epoch": 0.4225,
      "grad_norm": 0.10856354981660843,
      "learning_rate": 0.00021481286232029735,
      "loss": 0.4298,
      "step": 845
    },
    {
      "epoch": 0.425,
      "grad_norm": 0.11233706772327423,
      "learning_rate": 0.0002136299134118085,
      "loss": 0.4484,
      "step": 850
    },
    {
      "epoch": 0.4275,
      "grad_norm": 0.1085442528128624,
      "learning_rate": 0.00021244211883906017,
      "loss": 0.4776,
      "step": 855
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.12297824025154114,
      "learning_rate": 0.0002112495690572077,
      "loss": 0.4029,
      "step": 860
    },
    {
      "epoch": 0.4325,
      "grad_norm": 0.10838114470243454,
      "learning_rate": 0.00021005235488353428,
      "loss": 0.4848,
      "step": 865
    },
    {
      "epoch": 0.435,
      "grad_norm": 0.10273341834545135,
      "learning_rate": 0.0002088505674905342,
      "loss": 0.3989,
      "step": 870
    },
    {
      "epoch": 0.4375,
      "grad_norm": 0.11189126968383789,
      "learning_rate": 0.0002076442983989705,
      "loss": 0.438,
      "step": 875
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.11592905968427658,
      "learning_rate": 0.0002064336394709048,
      "loss": 0.4786,
      "step": 880
    },
    {
      "epoch": 0.4425,
      "grad_norm": 0.11230389773845673,
      "learning_rate": 0.0002052186829027017,
      "loss": 0.3999,
      "step": 885
    },
    {
      "epoch": 0.445,
      "grad_norm": 0.12455113977193832,
      "learning_rate": 0.00020399952121800767,
      "loss": 0.4856,
      "step": 890
    },
    {
      "epoch": 0.4475,
      "grad_norm": 0.1001812294125557,
      "learning_rate": 0.00020277624726070526,
      "loss": 0.4689,
      "step": 895
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.11319112777709961,
      "learning_rate": 0.00020154895418784242,
      "loss": 0.3998,
      "step": 900
    },
    {
      "epoch": 0.4525,
      "grad_norm": 0.11322236061096191,
      "learning_rate": 0.00020031773546253824,
      "loss": 0.4321,
      "step": 905
    },
    {
      "epoch": 0.455,
      "grad_norm": 0.12924689054489136,
      "learning_rate": 0.00019908268484686558,
      "loss": 0.4208,
      "step": 910
    },
    {
      "epoch": 0.4575,
      "grad_norm": 0.11435618251562119,
      "learning_rate": 0.00019784389639471048,
      "loss": 0.4682,
      "step": 915
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.10801081359386444,
      "learning_rate": 0.00019660146444460975,
      "loss": 0.428,
      "step": 920
    },
    {
      "epoch": 0.4625,
      "grad_norm": 0.10906939953565598,
      "learning_rate": 0.0001953554836125667,
      "loss": 0.4455,
      "step": 925
    },
    {
      "epoch": 0.465,
      "grad_norm": 0.10790123790502548,
      "learning_rate": 0.00019410604878484556,
      "loss": 0.4544,
      "step": 930
    },
    {
      "epoch": 0.4675,
      "grad_norm": 0.10536376386880875,
      "learning_rate": 0.000192853255110746,
      "loss": 0.376,
      "step": 935
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.11744682490825653,
      "learning_rate": 0.00019159719799535668,
      "loss": 0.3887,
      "step": 940
    },
    {
      "epoch": 0.4725,
      "grad_norm": 0.12954068183898926,
      "learning_rate": 0.00019033797309228983,
      "loss": 0.4075,
      "step": 945
    },
    {
      "epoch": 0.475,
      "grad_norm": 0.1401606798171997,
      "learning_rate": 0.00018907567629639725,
      "loss": 0.4454,
      "step": 950
    },
    {
      "epoch": 0.4775,
      "grad_norm": 0.12059322744607925,
      "learning_rate": 0.00018781040373646706,
      "loss": 0.4339,
      "step": 955
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.11798987537622452,
      "learning_rate": 0.00018654225176790336,
      "loss": 0.4405,
      "step": 960
    },
    {
      "epoch": 0.4825,
      "grad_norm": 0.11344211548566818,
      "learning_rate": 0.00018527131696538846,
      "loss": 0.4124,
      "step": 965
    },
    {
      "epoch": 0.485,
      "grad_norm": 0.10373330116271973,
      "learning_rate": 0.00018399769611552824,
      "loss": 0.4329,
      "step": 970
    },
    {
      "epoch": 0.4875,
      "grad_norm": 0.12053704261779785,
      "learning_rate": 0.0001827214862094814,
      "loss": 0.4944,
      "step": 975
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.141033336520195,
      "learning_rate": 0.00018144278443557328,
      "loss": 0.4569,
      "step": 980
    },
    {
      "epoch": 0.4925,
      "grad_norm": 0.10922867804765701,
      "learning_rate": 0.0001801616881718947,
      "loss": 0.3879,
      "step": 985
    },
    {
      "epoch": 0.495,
      "grad_norm": 0.09843657910823822,
      "learning_rate": 0.00017887829497888612,
      "loss": 0.4106,
      "step": 990
    },
    {
      "epoch": 0.4975,
      "grad_norm": 0.12131062150001526,
      "learning_rate": 0.000177592702591908,
      "loss": 0.4023,
      "step": 995
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.11343283206224442,
      "learning_rate": 0.00017630500891379806,
      "loss": 0.4824,
      "step": 1000
    },
    {
      "epoch": 0.5025,
      "grad_norm": 0.11050508171319962,
      "learning_rate": 0.00017501531200741534,
      "loss": 0.4098,
      "step": 1005
    },
    {
      "epoch": 0.505,
      "grad_norm": 0.11737144738435745,
      "learning_rate": 0.00017372371008817256,
      "loss": 0.3943,
      "step": 1010
    },
    {
      "epoch": 0.5075,
      "grad_norm": 0.11473528295755386,
      "learning_rate": 0.00017243030151655643,
      "loss": 0.3796,
      "step": 1015
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.13086555898189545,
      "learning_rate": 0.00017113518479063738,
      "loss": 0.4367,
      "step": 1020
    },
    {
      "epoch": 0.5125,
      "grad_norm": 0.11752833425998688,
      "learning_rate": 0.00016983845853856837,
      "loss": 0.4097,
      "step": 1025
    },
| { | |
| "epoch": 0.515, | |
| "grad_norm": 0.11596900969743729, | |
| "learning_rate": 0.0001685402215110739, | |
| "loss": 0.3812, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5175, | |
| "grad_norm": 0.11850260943174362, | |
| "learning_rate": 0.00016724057257392998, | |
| "loss": 0.4354, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.12466365844011307, | |
| "learning_rate": 0.00016593961070043498, | |
| "loss": 0.4317, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5225, | |
| "grad_norm": 0.11178991943597794, | |
| "learning_rate": 0.0001646374349638724, | |
| "loss": 0.3936, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.11252165585756302, | |
| "learning_rate": 0.00016333414452996623, | |
| "loss": 0.386, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5275, | |
| "grad_norm": 0.12886975705623627, | |
| "learning_rate": 0.0001620298386493288, | |
| "loss": 0.3965, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.11716549098491669, | |
| "learning_rate": 0.00016072461664990288, | |
| "loss": 0.3924, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5325, | |
| "grad_norm": 0.11604485660791397, | |
| "learning_rate": 0.000159418577929397, | |
| "loss": 0.3624, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 0.11538460850715637, | |
| "learning_rate": 0.00015811182194771633, | |
| "loss": 0.4338, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5375, | |
| "grad_norm": 0.11618762463331223, | |
| "learning_rate": 0.00015680444821938804, | |
| "loss": 0.4058, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.11750835925340652, | |
| "learning_rate": 0.00015549655630598343, | |
| "loss": 0.4422, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5425, | |
| "grad_norm": 0.12725204229354858, | |
| "learning_rate": 0.00015418824580853535, | |
| "loss": 0.4422, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 0.11274927109479904, | |
| "learning_rate": 0.00015287961635995347, | |
| "loss": 0.4229, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.5475, | |
| "grad_norm": 0.11833129078149796, | |
| "learning_rate": 0.00015157076761743686, | |
| "loss": 0.4442, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.11384794861078262, | |
| "learning_rate": 0.00015026179925488475, | |
| "loss": 0.4528, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5525, | |
| "grad_norm": 0.11864661425352097, | |
| "learning_rate": 0.00014895281095530575, | |
| "loss": 0.3988, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 0.11673832684755325, | |
| "learning_rate": 0.00014764390240322691, | |
| "loss": 0.3544, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.5575, | |
| "grad_norm": 0.1174502745270729, | |
| "learning_rate": 0.00014633517327710202, | |
| "loss": 0.4034, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.12685547769069672, | |
| "learning_rate": 0.00014502672324172107, | |
| "loss": 0.3595, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 0.12368053942918777, | |
| "learning_rate": 0.00014371865194062007, | |
| "loss": 0.3395, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.565, | |
| "grad_norm": 0.1077839657664299, | |
| "learning_rate": 0.000142411058988493, | |
| "loss": 0.4199, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.5675, | |
| "grad_norm": 0.11699855327606201, | |
| "learning_rate": 0.00014110404396360576, | |
| "loss": 0.3443, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.13238464295864105, | |
| "learning_rate": 0.0001397977064002128, | |
| "loss": 0.3499, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.5725, | |
| "grad_norm": 0.11482933163642883, | |
| "learning_rate": 0.0001384921457809772, | |
| "loss": 0.3619, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 0.13390353322029114, | |
| "learning_rate": 0.00013718746152939487, | |
| "loss": 0.3684, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5775, | |
| "grad_norm": 0.11464900523424149, | |
| "learning_rate": 0.00013588375300222283, | |
| "loss": 0.3313, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.10367871820926666, | |
| "learning_rate": 0.00013458111948191296, | |
| "loss": 0.3323, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.5825, | |
| "grad_norm": 0.12259294092655182, | |
| "learning_rate": 0.0001332796601690512, | |
| "loss": 0.3986, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.585, | |
| "grad_norm": 0.10923358052968979, | |
| "learning_rate": 0.00013197947417480292, | |
| "loss": 0.3808, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.5875, | |
| "grad_norm": 0.12479504942893982, | |
| "learning_rate": 0.0001306806605133656, | |
| "loss": 0.4429, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.11521733552217484, | |
| "learning_rate": 0.000129383318094428, | |
| "loss": 0.4778, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.5925, | |
| "grad_norm": 0.14112086594104767, | |
| "learning_rate": 0.00012808754571563827, | |
| "loss": 0.4634, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.595, | |
| "grad_norm": 0.12947902083396912, | |
| "learning_rate": 0.00012679344205507981, | |
| "loss": 0.4439, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.5975, | |
| "grad_norm": 0.13288578391075134, | |
| "learning_rate": 0.0001255011056637567, | |
| "loss": 0.4402, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.1216069906949997, | |
| "learning_rate": 0.00012421063495808853, | |
| "loss": 0.4203, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6025, | |
| "grad_norm": 0.11649637669324875, | |
| "learning_rate": 0.000122922128212416, | |
| "loss": 0.4512, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.605, | |
| "grad_norm": 0.1201406940817833, | |
| "learning_rate": 0.00012163568355151628, | |
| "loss": 0.3725, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6075, | |
| "grad_norm": 0.12117727100849152, | |
| "learning_rate": 0.00012035139894313107, | |
| "loss": 0.4352, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.11709322035312653, | |
| "learning_rate": 0.00011906937219050556, | |
| "loss": 0.4189, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6125, | |
| "grad_norm": 0.11865726858377457, | |
| "learning_rate": 0.0001177897009249405, | |
| "loss": 0.3796, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.615, | |
| "grad_norm": 0.10807759314775467, | |
| "learning_rate": 0.0001165124825983573, | |
| "loss": 0.4465, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6175, | |
| "grad_norm": 0.13788209855556488, | |
| "learning_rate": 0.00011523781447587641, | |
| "loss": 0.4994, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.12921364605426788, | |
| "learning_rate": 0.00011396579362841044, | |
| "loss": 0.4251, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6225, | |
| "grad_norm": 0.12162365019321442, | |
| "learning_rate": 0.0001126965169252718, | |
| "loss": 0.3864, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.12897826731204987, | |
| "learning_rate": 0.00011143008102679559, | |
| "loss": 0.3753, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6275, | |
| "grad_norm": 0.116109699010849, | |
| "learning_rate": 0.00011016658237697866, | |
| "loss": 0.3296, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.12935414910316467, | |
| "learning_rate": 0.00010890611719613512, | |
| "loss": 0.3797, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6325, | |
| "grad_norm": 0.13730891048908234, | |
| "learning_rate": 0.0001076487814735685, | |
| "loss": 0.3711, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.635, | |
| "grad_norm": 0.13870631158351898, | |
| "learning_rate": 0.00010639467096026211, | |
| "loss": 0.4328, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.6375, | |
| "grad_norm": 0.11644043773412704, | |
| "learning_rate": 0.00010514388116158701, | |
| "loss": 0.3283, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.12221091985702515, | |
| "learning_rate": 0.00010389650733002894, | |
| "loss": 0.3898, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.6425, | |
| "grad_norm": 0.12048634141683578, | |
| "learning_rate": 0.00010265264445793464, | |
| "loss": 0.3256, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.645, | |
| "grad_norm": 0.1250566840171814, | |
| "learning_rate": 0.00010141238727027761, | |
| "loss": 0.408, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.6475, | |
| "grad_norm": 0.13518592715263367, | |
| "learning_rate": 0.00010017583021744454, | |
| "loss": 0.3763, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.13047736883163452, | |
| "learning_rate": 9.89430674680425e-05, | |
| "loss": 0.3989, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6525, | |
| "grad_norm": 0.11474955826997757, | |
| "learning_rate": 9.771419290172773e-05, | |
| "loss": 0.3374, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.655, | |
| "grad_norm": 0.11670063436031342, | |
| "learning_rate": 9.648930010205619e-05, | |
| "loss": 0.3343, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.6575, | |
| "grad_norm": 0.15385080873966217, | |
| "learning_rate": 9.526848234935704e-05, | |
| "loss": 0.3432, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.13441519439220428, | |
| "learning_rate": 9.405183261362863e-05, | |
| "loss": 0.3116, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.6625, | |
| "grad_norm": 0.14772167801856995, | |
| "learning_rate": 9.283944354745888e-05, | |
| "loss": 0.3613, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.665, | |
| "grad_norm": 0.12146154791116714, | |
| "learning_rate": 9.163140747896907e-05, | |
| "loss": 0.3411, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.6675, | |
| "grad_norm": 0.1333102583885193, | |
| "learning_rate": 9.042781640478291e-05, | |
| "loss": 0.396, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.12051521986722946, | |
| "learning_rate": 8.922876198302062e-05, | |
| "loss": 0.3837, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.6725, | |
| "grad_norm": 0.12071400880813599, | |
| "learning_rate": 8.803433552631874e-05, | |
| "loss": 0.354, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.675, | |
| "grad_norm": 0.11258620023727417, | |
| "learning_rate": 8.684462799487635e-05, | |
| "loss": 0.3197, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6775, | |
| "grad_norm": 0.11908067762851715, | |
| "learning_rate": 8.565972998952814e-05, | |
| "loss": 0.377, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.1252991259098053, | |
| "learning_rate": 8.447973174484469e-05, | |
| "loss": 0.3438, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.6825, | |
| "grad_norm": 0.12832245230674744, | |
| "learning_rate": 8.330472312226091e-05, | |
| "loss": 0.346, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.685, | |
| "grad_norm": 0.1396942287683487, | |
| "learning_rate": 8.213479360323258e-05, | |
| "loss": 0.3886, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.6875, | |
| "grad_norm": 0.12938210368156433, | |
| "learning_rate": 8.097003228242225e-05, | |
| "loss": 0.3699, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.12459377944469452, | |
| "learning_rate": 7.9810527860914e-05, | |
| "loss": 0.3892, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.6925, | |
| "grad_norm": 0.1360333263874054, | |
| "learning_rate": 7.86563686394587e-05, | |
| "loss": 0.3423, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.695, | |
| "grad_norm": 0.1357765644788742, | |
| "learning_rate": 7.750764251174963e-05, | |
| "loss": 0.408, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.6975, | |
| "grad_norm": 0.14453718066215515, | |
| "learning_rate": 7.636443695772887e-05, | |
| "loss": 0.3398, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.11541519314050674, | |
| "learning_rate": 7.522683903692547e-05, | |
| "loss": 0.4203, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7025, | |
| "grad_norm": 0.13344840705394745, | |
| "learning_rate": 7.409493538182545e-05, | |
| "loss": 0.3694, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.705, | |
| "grad_norm": 0.13069866597652435, | |
| "learning_rate": 7.296881219127452e-05, | |
| "loss": 0.3889, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7075, | |
| "grad_norm": 0.12457838654518127, | |
| "learning_rate": 7.184855522391359e-05, | |
| "loss": 0.3342, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.11990659683942795, | |
| "learning_rate": 7.073424979164794e-05, | |
| "loss": 0.3855, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7125, | |
| "grad_norm": 0.1389523446559906, | |
| "learning_rate": 6.962598075315046e-05, | |
| "loss": 0.3943, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.715, | |
| "grad_norm": 0.14108599722385406, | |
| "learning_rate": 6.852383250739938e-05, | |
| "loss": 0.388, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.7175, | |
| "grad_norm": 0.1342005580663681, | |
| "learning_rate": 6.742788898725065e-05, | |
| "loss": 0.3602, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.13516324758529663, | |
| "learning_rate": 6.633823365304648e-05, | |
| "loss": 0.3935, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.7225, | |
| "grad_norm": 0.1302197426557541, | |
| "learning_rate": 6.52549494862593e-05, | |
| "loss": 0.3618, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.725, | |
| "grad_norm": 0.12428996711969376, | |
| "learning_rate": 6.417811898317259e-05, | |
| "loss": 0.3338, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.7275, | |
| "grad_norm": 0.11249776184558868, | |
| "learning_rate": 6.31078241485982e-05, | |
| "loss": 0.3819, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.1359994113445282, | |
| "learning_rate": 6.204414648963159e-05, | |
| "loss": 0.3356, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.7325, | |
| "grad_norm": 0.1118568629026413, | |
| "learning_rate": 6.098716700944479e-05, | |
| "loss": 0.3223, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.735, | |
| "grad_norm": 0.12038140743970871, | |
| "learning_rate": 5.993696620111741e-05, | |
| "loss": 0.3481, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.7375, | |
| "grad_norm": 0.12787550687789917, | |
| "learning_rate": 5.889362404150703e-05, | |
| "loss": 0.3766, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.12134893983602524, | |
| "learning_rate": 5.7857219985158506e-05, | |
| "loss": 0.2916, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.7425, | |
| "grad_norm": 0.1274223029613495, | |
| "learning_rate": 5.682783295825345e-05, | |
| "loss": 0.3095, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.745, | |
| "grad_norm": 0.11817299574613571, | |
| "learning_rate": 5.580554135259932e-05, | |
| "loss": 0.3422, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.7475, | |
| "grad_norm": 0.1348387748003006, | |
| "learning_rate": 5.479042301965987e-05, | |
| "loss": 0.4044, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.14032681286334991, | |
| "learning_rate": 5.378255526462631e-05, | |
| "loss": 0.337, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7525, | |
| "grad_norm": 0.1196574866771698, | |
| "learning_rate": 5.2782014840530366e-05, | |
| "loss": 0.3638, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.755, | |
| "grad_norm": 0.1307535171508789, | |
| "learning_rate": 5.178887794239904e-05, | |
| "loss": 0.3514, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.7575, | |
| "grad_norm": 0.12303224951028824, | |
| "learning_rate": 5.080322020145224e-05, | |
| "loss": 0.3825, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.11517804116010666, | |
| "learning_rate": 4.9825116679343025e-05, | |
| "loss": 0.3474, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.7625, | |
| "grad_norm": 0.1276445835828781, | |
| "learning_rate": 4.885464186244154e-05, | |
| "loss": 0.3084, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.765, | |
| "grad_norm": 0.12166495621204376, | |
| "learning_rate": 4.789186965616232e-05, | |
| "loss": 0.2949, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.7675, | |
| "grad_norm": 0.13007108867168427, | |
| "learning_rate": 4.6936873379336564e-05, | |
| "loss": 0.3336, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.12368687242269516, | |
| "learning_rate": 4.598972575862803e-05, | |
| "loss": 0.3443, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.7725, | |
| "grad_norm": 0.11817432940006256, | |
| "learning_rate": 4.5050498922995166e-05, | |
| "loss": 0.3198, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.775, | |
| "grad_norm": 0.13239014148712158, | |
| "learning_rate": 4.4119264398197843e-05, | |
| "loss": 0.3145, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.7775, | |
| "grad_norm": 0.12305855751037598, | |
| "learning_rate": 4.319609310135054e-05, | |
| "loss": 0.3276, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.13063360750675201, | |
| "learning_rate": 4.228105533552169e-05, | |
| "loss": 0.4115, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.7825, | |
| "grad_norm": 0.12751415371894836, | |
| "learning_rate": 4.137422078437991e-05, | |
| "loss": 0.4113, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.785, | |
| "grad_norm": 0.1429520696401596, | |
| "learning_rate": 4.0475658506887136e-05, | |
| "loss": 0.3634, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.7875, | |
| "grad_norm": 0.13072626292705536, | |
| "learning_rate": 3.9585436932039846e-05, | |
| "loss": 0.3914, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.13076546788215637, | |
| "learning_rate": 3.870362385365755e-05, | |
| "loss": 0.3153, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.7925, | |
| "grad_norm": 0.11764945089817047, | |
| "learning_rate": 3.7830286425220234e-05, | |
| "loss": 0.331, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.795, | |
| "grad_norm": 0.12469421327114105, | |
| "learning_rate": 3.696549115475434e-05, | |
| "loss": 0.3667, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.7975, | |
| "grad_norm": 0.13257570564746857, | |
| "learning_rate": 3.6109303899767875e-05, | |
| "loss": 0.3775, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.1399105191230774, | |
| "learning_rate": 3.5261789862235235e-05, | |
| "loss": 0.3786, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8025, | |
| "grad_norm": 0.1299823671579361, | |
| "learning_rate": 3.442301358363163e-05, | |
| "loss": 0.3984, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.805, | |
| "grad_norm": 0.12068431079387665, | |
| "learning_rate": 3.359303894001809e-05, | |
| "loss": 0.3416, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8075, | |
| "grad_norm": 0.12825050950050354, | |
| "learning_rate": 3.277192913717717e-05, | |
| "loss": 0.3973, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.12794139981269836, | |
| "learning_rate": 3.195974670579941e-05, | |
| "loss": 0.3942, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8125, | |
| "grad_norm": 0.1178906112909317, | |
| "learning_rate": 3.115655349672141e-05, | |
| "loss": 0.3549, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.815, | |
| "grad_norm": 0.11859016120433807, | |
| "learning_rate": 3.036241067621575e-05, | |
| "loss": 0.3113, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.8175, | |
| "grad_norm": 0.12508928775787354, | |
| "learning_rate": 2.9577378721332843e-05, | |
| "loss": 0.3802, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.1293668895959854, | |
| "learning_rate": 2.8801517415295455e-05, | |
| "loss": 0.3098, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.8225, | |
| "grad_norm": 0.12039236724376678, | |
| "learning_rate": 2.8034885842945865e-05, | |
| "loss": 0.2876, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.825, | |
| "grad_norm": 0.14805036783218384, | |
| "learning_rate": 2.7277542386246454e-05, | |
| "loss": 0.3618, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.8275, | |
| "grad_norm": 0.12638579308986664, | |
| "learning_rate": 2.6529544719833706e-05, | |
| "loss": 0.3328, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.12427478283643723, | |
| "learning_rate": 2.5790949806625838e-05, | |
| "loss": 0.3394, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.8325, | |
| "grad_norm": 0.1283419132232666, | |
| "learning_rate": 2.5061813893485085e-05, | |
| "loss": 0.3392, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.835, | |
| "grad_norm": 0.12487384676933289, | |
| "learning_rate": 2.434219250693419e-05, | |
| "loss": 0.3592, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.8375, | |
| "grad_norm": 0.14032793045043945, | |
| "learning_rate": 2.363214044892788e-05, | |
| "loss": 0.4099, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.10917101800441742, | |
| "learning_rate": 2.293171179267946e-05, | |
| "loss": 0.3204, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.8425, | |
| "grad_norm": 0.1253073364496231, | |
| "learning_rate": 2.2240959878542848e-05, | |
| "loss": 0.3378, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.845, | |
| "grad_norm": 0.14096981287002563, | |
| "learning_rate": 2.155993730995077e-05, | |
| "loss": 0.378, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.8475, | |
| "grad_norm": 0.12039178609848022, | |
| "learning_rate": 2.0888695949408468e-05, | |
| "loss": 0.3197, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.12723132967948914, | |
| "learning_rate": 2.0227286914544353e-05, | |
| "loss": 0.3241, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.8525, | |
| "grad_norm": 0.1309029906988144, | |
| "learning_rate": 1.9575760574217147e-05, | |
| "loss": 0.3743, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.855, | |
| "grad_norm": 0.1324499100446701, | |
| "learning_rate": 1.893416654468022e-05, | |
| "loss": 0.345, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.8575, | |
| "grad_norm": 0.11905783414840698, | |
| "learning_rate": 1.8302553685802917e-05, | |
| "loss": 0.3514, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.12570443749427795, | |
| "learning_rate": 1.768097009734985e-05, | |
| "loss": 0.3791, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.8625, | |
| "grad_norm": 0.13414913415908813, | |
| "learning_rate": 1.7069463115317788e-05, | |
| "loss": 0.3575, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.865, | |
| "grad_norm": 0.1283785104751587, | |
| "learning_rate": 1.6468079308331023e-05, | |
| "loss": 0.3496, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.8675, | |
| "grad_norm": 0.11180217564105988, | |
| "learning_rate": 1.587686447409478e-05, | |
| "loss": 0.3245, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.13804157078266144, | |
| "learning_rate": 1.5295863635907667e-05, | |
| "loss": 0.367, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.8725, | |
| "grad_norm": 0.12629055976867676, | |
| "learning_rate": 1.4725121039232945e-05, | |
| "loss": 0.293, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 0.12774884700775146, | |
| "learning_rate": 1.4164680148329088e-05, | |
| "loss": 0.3798, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8775, | |
| "grad_norm": 0.11681339889764786, | |
| "learning_rate": 1.3614583642939718e-05, | |
| "loss": 0.3474, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.14510560035705566, | |
| "learning_rate": 1.3074873415043591e-05, | |
| "loss": 0.3999, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.8825, | |
| "grad_norm": 0.1168401762843132, | |
| "learning_rate": 1.2545590565664054e-05, | |
| "loss": 0.3398, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.885, | |
| "grad_norm": 0.1411600410938263, | |
| "learning_rate": 1.2026775401739348e-05, | |
| "loss": 0.3346, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.8875, | |
| "grad_norm": 0.12797729671001434, | |
| "learning_rate": 1.1518467433052863e-05, | |
| "loss": 0.3742, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.12946921586990356, | |
| "learning_rate": 1.1020705369224414e-05, | |
| "loss": 0.3436, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.8925, | |
| "grad_norm": 0.13285613059997559, | |
| "learning_rate": 1.0533527116762296e-05, | |
| "loss": 0.3186, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.895, | |
| "grad_norm": 0.15213604271411896, | |
| "learning_rate": 1.005696977617666e-05, | |
| "loss": 0.3629, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.8975, | |
| "grad_norm": 0.12391404062509537, | |
| "learning_rate": 9.591069639154008e-06, | |
| "loss": 0.3421, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.11592845618724823, | |
| "learning_rate": 9.135862185793636e-06, | |
| "loss": 0.3107, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9025, | |
| "grad_norm": 0.12540902197360992, | |
| "learning_rate": 8.691382081905496e-06, | |
| "loss": 0.3605, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.905, | |
| "grad_norm": 0.14459215104579926, | |
| "learning_rate": 8.257663176370389e-06, | |
| "loss": 0.3884, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9075, | |
| "grad_norm": 0.14139464497566223, | |
| "learning_rate": 7.834738498562165e-06, | |
| "loss": 0.3728, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.12125397473573685, | |
| "learning_rate": 7.422640255832446e-06, | |
| "loss": 0.3237, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.9125, | |
| "grad_norm": 0.13039612770080566, | |
| "learning_rate": 7.021399831057961e-06, | |
| "loss": 0.3055, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.915, | |
| "grad_norm": 0.1337701678276062, | |
| "learning_rate": 6.631047780250481e-06, | |
| "loss": 0.368, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.9175, | |
| "grad_norm": 0.13020606338977814, | |
| "learning_rate": 6.251613830230013e-06, | |
| "loss": 0.3262, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.12915077805519104, | |
| "learning_rate": 5.883126876360872e-06, | |
| "loss": 0.3428, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.9225, | |
| "grad_norm": 0.12774400413036346, | |
| "learning_rate": 5.525614980351284e-06, | |
| "loss": 0.3735, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.925, | |
| "grad_norm": 0.12587039172649384, | |
| "learning_rate": 5.1791053681162545e-06, | |
| "loss": 0.3402, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.9275, | |
| "grad_norm": 0.12152459472417831, | |
| "learning_rate": 4.843624427704329e-06, | |
| "loss": 0.2968, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.11444247514009476, | |
| "learning_rate": 4.519197707287986e-06, | |
| "loss": 0.3448, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.9325, | |
| "grad_norm": 0.12532518804073334, | |
| "learning_rate": 4.2058499132180734e-06, | |
| "loss": 0.3613, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.935, | |
| "grad_norm": 0.14186476171016693, | |
| "learning_rate": 3.903604908142266e-06, | |
| "loss": 0.2887, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 0.13014192879199982, | |
| "learning_rate": 3.6124857091878845e-06, | |
| "loss": 0.2679, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.1259031891822815, | |
| "learning_rate": 3.3325144862090648e-06, | |
| "loss": 0.2993, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.9425, | |
| "grad_norm": 0.12168288230895996, | |
| "learning_rate": 3.0637125600983916e-06, | |
| "loss": 0.3317, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.945, | |
| "grad_norm": 0.12291324138641357, | |
| "learning_rate": 2.8061004011632302e-06, | |
| "loss": 0.3311, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.9475, | |
| "grad_norm": 0.13629783689975739, | |
| "learning_rate": 2.5596976275668757e-06, | |
| "loss": 0.3456, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.17415851354599, | |
| "learning_rate": 2.324523003834511e-06, | |
| "loss": 0.3589, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.9525, | |
| "grad_norm": 0.1330641210079193, | |
| "learning_rate": 2.100594439424269e-06, | |
| "loss": 0.3826, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.955, | |
| "grad_norm": 0.14203837513923645, | |
| "learning_rate": 1.8879289873632907e-06, | |
| "loss": 0.3807, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.9575, | |
| "grad_norm": 0.1222100704908371, | |
| "learning_rate": 1.686542842949129e-06, | |
| "loss": 0.3084, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.1441483348608017, | |
| "learning_rate": 1.4964513425163694e-06, | |
| "loss": 0.3871, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.9625, | |
| "grad_norm": 0.1402144581079483, | |
| "learning_rate": 1.3176689622687474e-06, | |
| "loss": 0.3192, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.965, | |
| "grad_norm": 0.13284745812416077, | |
| "learning_rate": 1.1502093171766979e-06, | |
| "loss": 0.359, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.9675, | |
| "grad_norm": 0.1253402829170227, | |
| "learning_rate": 9.94085159940533e-07, | |
| "loss": 0.3214, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.13589312136173248, | |
| "learning_rate": 8.493083800193034e-07, | |
| "loss": 0.3524, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.9725, | |
| "grad_norm": 0.13623379170894623, | |
| "learning_rate": 7.158900027253223e-07, | |
| "loss": 0.3711, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.975, | |
| "grad_norm": 0.12516111135482788, | |
| "learning_rate": 5.9384018838457e-07, | |
| "loss": 0.3487, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9775, | |
| "grad_norm": 0.1211727038025856, | |
| "learning_rate": 4.831682315629304e-07, | |
| "loss": 0.3079, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.1348896622657776, | |
| "learning_rate": 3.8388256035840615e-07, | |
| "loss": 0.322, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.9825, | |
| "grad_norm": 0.12953124940395355, | |
| "learning_rate": 2.959907357592661e-07, | |
| "loss": 0.3054, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.985, | |
| "grad_norm": 0.12745600938796997, | |
| "learning_rate": 2.1949945106823909e-07, | |
| "loss": 0.3208, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.9875, | |
| "grad_norm": 0.13108642399311066, | |
| "learning_rate": 1.544145313928047e-07, | |
| "loss": 0.3641, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.12415596097707748, | |
| "learning_rate": 1.0074093320156517e-07, | |
| "loss": 0.3141, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.9925, | |
| "grad_norm": 0.12116590887308121, | |
| "learning_rate": 5.8482743946847153e-08, | |
| "loss": 0.3085, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.995, | |
| "grad_norm": 0.12617753446102142, | |
| "learning_rate": 2.764318175336733e-08, | |
| "loss": 0.316, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.9975, | |
| "grad_norm": 0.13097520172595978, | |
| "learning_rate": 8.224595173178527e-09, | |
| "loss": 0.2772, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.1454041749238968, | |
| "learning_rate": 2.284630068460913e-10, | |
| "loss": 0.3226, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.629578157719552e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |