| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.6776034236804565, |
| "eval_steps": 500, |
| "global_step": 1900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001783166904422254, |
| "grad_norm": 2.0930111408233643, |
| "learning_rate": 4.999995641358869e-05, |
| "loss": 0.7967, |
| "num_input_tokens_seen": 63024, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.003566333808844508, |
| "grad_norm": 1.2970882654190063, |
| "learning_rate": 4.999982565450674e-05, |
| "loss": 0.7382, |
| "num_input_tokens_seen": 126336, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.005349500713266762, |
| "grad_norm": 0.8319762349128723, |
| "learning_rate": 4.999960772321009e-05, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 184688, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.007132667617689016, |
| "grad_norm": 0.9985227584838867, |
| "learning_rate": 4.999930262045865e-05, |
| "loss": 0.6836, |
| "num_input_tokens_seen": 245808, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00891583452211127, |
| "grad_norm": 1.065556287765503, |
| "learning_rate": 4.9998910347316286e-05, |
| "loss": 0.7561, |
| "num_input_tokens_seen": 306944, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.010699001426533523, |
| "grad_norm": 1.066805362701416, |
| "learning_rate": 4.9998430905150826e-05, |
| "loss": 0.7299, |
| "num_input_tokens_seen": 371616, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.012482168330955777, |
| "grad_norm": 1.2590147256851196, |
| "learning_rate": 4.999786429563404e-05, |
| "loss": 0.6834, |
| "num_input_tokens_seen": 435536, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.014265335235378032, |
| "grad_norm": 1.0066215991973877, |
| "learning_rate": 4.999721052074164e-05, |
| "loss": 0.6511, |
| "num_input_tokens_seen": 499328, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.016048502139800285, |
| "grad_norm": 1.0162546634674072, |
| "learning_rate": 4.99964695827533e-05, |
| "loss": 0.5992, |
| "num_input_tokens_seen": 557504, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.01783166904422254, |
| "grad_norm": 0.9829245209693909, |
| "learning_rate": 4.999564148425258e-05, |
| "loss": 0.6245, |
| "num_input_tokens_seen": 621440, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.019614835948644792, |
| "grad_norm": 0.9447645545005798, |
| "learning_rate": 4.999472622812701e-05, |
| "loss": 0.6444, |
| "num_input_tokens_seen": 685856, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.021398002853067047, |
| "grad_norm": 1.0958608388900757, |
| "learning_rate": 4.9993723817567996e-05, |
| "loss": 0.5194, |
| "num_input_tokens_seen": 748112, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.023181169757489302, |
| "grad_norm": 0.9865729808807373, |
| "learning_rate": 4.999263425607086e-05, |
| "loss": 0.5021, |
| "num_input_tokens_seen": 811008, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.024964336661911554, |
| "grad_norm": 1.2535978555679321, |
| "learning_rate": 4.9991457547434805e-05, |
| "loss": 0.6641, |
| "num_input_tokens_seen": 878272, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02674750356633381, |
| "grad_norm": 1.6020156145095825, |
| "learning_rate": 4.9990193695762914e-05, |
| "loss": 0.5479, |
| "num_input_tokens_seen": 942608, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.028530670470756064, |
| "grad_norm": 1.1668367385864258, |
| "learning_rate": 4.998884270546214e-05, |
| "loss": 0.6181, |
| "num_input_tokens_seen": 1005776, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.030313837375178315, |
| "grad_norm": 1.1580744981765747, |
| "learning_rate": 4.998740458124324e-05, |
| "loss": 0.6266, |
| "num_input_tokens_seen": 1068192, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.03209700427960057, |
| "grad_norm": 0.9773775339126587, |
| "learning_rate": 4.9985879328120846e-05, |
| "loss": 0.5088, |
| "num_input_tokens_seen": 1128592, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.033880171184022825, |
| "grad_norm": 1.4142199754714966, |
| "learning_rate": 4.9984266951413396e-05, |
| "loss": 0.5199, |
| "num_input_tokens_seen": 1194592, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.03566333808844508, |
| "grad_norm": 1.459350347518921, |
| "learning_rate": 4.998256745674308e-05, |
| "loss": 0.5855, |
| "num_input_tokens_seen": 1257744, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.037446504992867335, |
| "grad_norm": 1.118642807006836, |
| "learning_rate": 4.99807808500359e-05, |
| "loss": 0.6148, |
| "num_input_tokens_seen": 1320944, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.039229671897289584, |
| "grad_norm": 1.1180983781814575, |
| "learning_rate": 4.99789071375216e-05, |
| "loss": 0.5517, |
| "num_input_tokens_seen": 1382928, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.04101283880171184, |
| "grad_norm": 1.2651177644729614, |
| "learning_rate": 4.9976946325733654e-05, |
| "loss": 0.5959, |
| "num_input_tokens_seen": 1449408, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.042796005706134094, |
| "grad_norm": 0.9860583543777466, |
| "learning_rate": 4.997489842150924e-05, |
| "loss": 0.4779, |
| "num_input_tokens_seen": 1510752, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04457917261055635, |
| "grad_norm": 1.0358836650848389, |
| "learning_rate": 4.997276343198922e-05, |
| "loss": 0.5474, |
| "num_input_tokens_seen": 1568928, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.046362339514978604, |
| "grad_norm": 1.3108216524124146, |
| "learning_rate": 4.997054136461811e-05, |
| "loss": 0.4624, |
| "num_input_tokens_seen": 1631872, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.04814550641940086, |
| "grad_norm": 1.0577709674835205, |
| "learning_rate": 4.996823222714408e-05, |
| "loss": 0.558, |
| "num_input_tokens_seen": 1694000, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.04992867332382311, |
| "grad_norm": 0.9583589434623718, |
| "learning_rate": 4.996583602761887e-05, |
| "loss": 0.535, |
| "num_input_tokens_seen": 1752208, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.05171184022824536, |
| "grad_norm": 1.1273239850997925, |
| "learning_rate": 4.9963352774397845e-05, |
| "loss": 0.581, |
| "num_input_tokens_seen": 1809968, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.05349500713266762, |
| "grad_norm": 0.9180589914321899, |
| "learning_rate": 4.9960782476139875e-05, |
| "loss": 0.5853, |
| "num_input_tokens_seen": 1875584, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05527817403708987, |
| "grad_norm": 0.9368972778320312, |
| "learning_rate": 4.9958125141807376e-05, |
| "loss": 0.5655, |
| "num_input_tokens_seen": 1936544, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.05706134094151213, |
| "grad_norm": 1.093083143234253, |
| "learning_rate": 4.9955380780666233e-05, |
| "loss": 0.5248, |
| "num_input_tokens_seen": 1997312, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.05884450784593438, |
| "grad_norm": 1.0452104806900024, |
| "learning_rate": 4.99525494022858e-05, |
| "loss": 0.5912, |
| "num_input_tokens_seen": 2058400, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.06062767475035663, |
| "grad_norm": 1.655479073524475, |
| "learning_rate": 4.9949631016538845e-05, |
| "loss": 0.5465, |
| "num_input_tokens_seen": 2123584, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.062410841654778886, |
| "grad_norm": 1.295340895652771, |
| "learning_rate": 4.994662563360152e-05, |
| "loss": 0.6319, |
| "num_input_tokens_seen": 2187776, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.06419400855920114, |
| "grad_norm": 1.1385325193405151, |
| "learning_rate": 4.994353326395334e-05, |
| "loss": 0.6121, |
| "num_input_tokens_seen": 2248592, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06597717546362339, |
| "grad_norm": 1.2202588319778442, |
| "learning_rate": 4.994035391837713e-05, |
| "loss": 0.5926, |
| "num_input_tokens_seen": 2311472, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.06776034236804565, |
| "grad_norm": 1.1300709247589111, |
| "learning_rate": 4.9937087607958987e-05, |
| "loss": 0.5075, |
| "num_input_tokens_seen": 2374240, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0695435092724679, |
| "grad_norm": 1.0753881931304932, |
| "learning_rate": 4.993373434408825e-05, |
| "loss": 0.5187, |
| "num_input_tokens_seen": 2434864, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.07132667617689016, |
| "grad_norm": 1.0271146297454834, |
| "learning_rate": 4.993029413845746e-05, |
| "loss": 0.5777, |
| "num_input_tokens_seen": 2495712, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07310984308131241, |
| "grad_norm": 1.7475312948226929, |
| "learning_rate": 4.9926767003062316e-05, |
| "loss": 0.5091, |
| "num_input_tokens_seen": 2555184, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.07489300998573467, |
| "grad_norm": 1.1732685565948486, |
| "learning_rate": 4.992315295020163e-05, |
| "loss": 0.5594, |
| "num_input_tokens_seen": 2616736, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.07667617689015692, |
| "grad_norm": 1.1418745517730713, |
| "learning_rate": 4.991945199247728e-05, |
| "loss": 0.633, |
| "num_input_tokens_seen": 2679568, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.07845934379457917, |
| "grad_norm": 1.5812561511993408, |
| "learning_rate": 4.991566414279421e-05, |
| "loss": 0.5361, |
| "num_input_tokens_seen": 2741888, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.08024251069900143, |
| "grad_norm": 1.2565455436706543, |
| "learning_rate": 4.99117894143603e-05, |
| "loss": 0.5128, |
| "num_input_tokens_seen": 2804736, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.08202567760342368, |
| "grad_norm": 1.081152081489563, |
| "learning_rate": 4.990782782068639e-05, |
| "loss": 0.4925, |
| "num_input_tokens_seen": 2864768, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.08380884450784594, |
| "grad_norm": 1.157086730003357, |
| "learning_rate": 4.9903779375586224e-05, |
| "loss": 0.5091, |
| "num_input_tokens_seen": 2925776, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.08559201141226819, |
| "grad_norm": 1.496232032775879, |
| "learning_rate": 4.989964409317637e-05, |
| "loss": 0.5611, |
| "num_input_tokens_seen": 2984032, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08737517831669044, |
| "grad_norm": 1.5581008195877075, |
| "learning_rate": 4.989542198787619e-05, |
| "loss": 0.4574, |
| "num_input_tokens_seen": 3047024, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0891583452211127, |
| "grad_norm": 1.1673293113708496, |
| "learning_rate": 4.9891113074407816e-05, |
| "loss": 0.4982, |
| "num_input_tokens_seen": 3105552, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09094151212553495, |
| "grad_norm": 1.1178501844406128, |
| "learning_rate": 4.988671736779604e-05, |
| "loss": 0.5412, |
| "num_input_tokens_seen": 3165632, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.09272467902995721, |
| "grad_norm": 1.1773957014083862, |
| "learning_rate": 4.988223488336832e-05, |
| "loss": 0.5028, |
| "num_input_tokens_seen": 3229392, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.09450784593437946, |
| "grad_norm": 1.1285181045532227, |
| "learning_rate": 4.987766563675467e-05, |
| "loss": 0.5414, |
| "num_input_tokens_seen": 3287616, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.09629101283880172, |
| "grad_norm": 1.630057454109192, |
| "learning_rate": 4.9873009643887666e-05, |
| "loss": 0.5512, |
| "num_input_tokens_seen": 3346496, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.09807417974322397, |
| "grad_norm": 2.1637048721313477, |
| "learning_rate": 4.986826692100236e-05, |
| "loss": 0.4881, |
| "num_input_tokens_seen": 3409312, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.09985734664764621, |
| "grad_norm": 1.9481849670410156, |
| "learning_rate": 4.98634374846362e-05, |
| "loss": 0.4716, |
| "num_input_tokens_seen": 3472752, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.10164051355206848, |
| "grad_norm": 1.3725030422210693, |
| "learning_rate": 4.9858521351629005e-05, |
| "loss": 0.5286, |
| "num_input_tokens_seen": 3534032, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.10342368045649072, |
| "grad_norm": 1.5664440393447876, |
| "learning_rate": 4.985351853912292e-05, |
| "loss": 0.4985, |
| "num_input_tokens_seen": 3598336, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.10520684736091299, |
| "grad_norm": 1.3553557395935059, |
| "learning_rate": 4.984842906456231e-05, |
| "loss": 0.5768, |
| "num_input_tokens_seen": 3662144, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.10699001426533523, |
| "grad_norm": 1.2202125787734985, |
| "learning_rate": 4.984325294569372e-05, |
| "loss": 0.4933, |
| "num_input_tokens_seen": 3724048, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10877318116975748, |
| "grad_norm": 1.0455083847045898, |
| "learning_rate": 4.9837990200565834e-05, |
| "loss": 0.5675, |
| "num_input_tokens_seen": 3784320, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.11055634807417974, |
| "grad_norm": 1.5656300783157349, |
| "learning_rate": 4.983264084752939e-05, |
| "loss": 0.5315, |
| "num_input_tokens_seen": 3849040, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.11233951497860199, |
| "grad_norm": 1.4153857231140137, |
| "learning_rate": 4.98272049052371e-05, |
| "loss": 0.5444, |
| "num_input_tokens_seen": 3909552, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.11412268188302425, |
| "grad_norm": 1.9048830270767212, |
| "learning_rate": 4.982168239264364e-05, |
| "loss": 0.4808, |
| "num_input_tokens_seen": 3969120, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.1159058487874465, |
| "grad_norm": 1.0821411609649658, |
| "learning_rate": 4.981607332900552e-05, |
| "loss": 0.4829, |
| "num_input_tokens_seen": 4029360, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.11768901569186876, |
| "grad_norm": 1.2863287925720215, |
| "learning_rate": 4.9810377733881065e-05, |
| "loss": 0.5273, |
| "num_input_tokens_seen": 4091296, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.11947218259629101, |
| "grad_norm": 1.3957486152648926, |
| "learning_rate": 4.98045956271303e-05, |
| "loss": 0.5443, |
| "num_input_tokens_seen": 4154304, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.12125534950071326, |
| "grad_norm": 1.1562933921813965, |
| "learning_rate": 4.979872702891495e-05, |
| "loss": 0.5046, |
| "num_input_tokens_seen": 4220400, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.12303851640513552, |
| "grad_norm": 1.1498775482177734, |
| "learning_rate": 4.979277195969829e-05, |
| "loss": 0.5393, |
| "num_input_tokens_seen": 4279408, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.12482168330955777, |
| "grad_norm": 1.2570199966430664, |
| "learning_rate": 4.978673044024514e-05, |
| "loss": 0.451, |
| "num_input_tokens_seen": 4339392, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.12660485021398002, |
| "grad_norm": 1.3947458267211914, |
| "learning_rate": 4.978060249162175e-05, |
| "loss": 0.5715, |
| "num_input_tokens_seen": 4399424, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.12838801711840228, |
| "grad_norm": 1.1799883842468262, |
| "learning_rate": 4.977438813519574e-05, |
| "loss": 0.5409, |
| "num_input_tokens_seen": 4460992, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.13017118402282454, |
| "grad_norm": 0.9736462831497192, |
| "learning_rate": 4.976808739263602e-05, |
| "loss": 0.5298, |
| "num_input_tokens_seen": 4525664, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.13195435092724678, |
| "grad_norm": 1.1682716608047485, |
| "learning_rate": 4.976170028591274e-05, |
| "loss": 0.481, |
| "num_input_tokens_seen": 4582160, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.13373751783166904, |
| "grad_norm": 1.3871419429779053, |
| "learning_rate": 4.975522683729719e-05, |
| "loss": 0.5021, |
| "num_input_tokens_seen": 4649328, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.1355206847360913, |
| "grad_norm": 1.1554944515228271, |
| "learning_rate": 4.9748667069361715e-05, |
| "loss": 0.5064, |
| "num_input_tokens_seen": 4711088, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.13730385164051356, |
| "grad_norm": 1.3844372034072876, |
| "learning_rate": 4.9742021004979656e-05, |
| "loss": 0.5516, |
| "num_input_tokens_seen": 4774864, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.1390870185449358, |
| "grad_norm": 1.4874283075332642, |
| "learning_rate": 4.9735288667325257e-05, |
| "loss": 0.4712, |
| "num_input_tokens_seen": 4834944, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.14087018544935806, |
| "grad_norm": 1.195500373840332, |
| "learning_rate": 4.97284700798736e-05, |
| "loss": 0.5326, |
| "num_input_tokens_seen": 4897264, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.14265335235378032, |
| "grad_norm": 1.1240135431289673, |
| "learning_rate": 4.97215652664005e-05, |
| "loss": 0.5958, |
| "num_input_tokens_seen": 4962208, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14443651925820256, |
| "grad_norm": 0.8974002599716187, |
| "learning_rate": 4.971457425098244e-05, |
| "loss": 0.5536, |
| "num_input_tokens_seen": 5027264, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.14621968616262482, |
| "grad_norm": 1.0974167585372925, |
| "learning_rate": 4.970749705799649e-05, |
| "loss": 0.4721, |
| "num_input_tokens_seen": 5093216, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.14800285306704708, |
| "grad_norm": 1.3087302446365356, |
| "learning_rate": 4.9700333712120195e-05, |
| "loss": 0.4383, |
| "num_input_tokens_seen": 5155296, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.14978601997146934, |
| "grad_norm": 5.880493640899658, |
| "learning_rate": 4.969308423833152e-05, |
| "loss": 0.5098, |
| "num_input_tokens_seen": 5216416, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.15156918687589158, |
| "grad_norm": 1.2446019649505615, |
| "learning_rate": 4.9685748661908756e-05, |
| "loss": 0.494, |
| "num_input_tokens_seen": 5278816, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.15335235378031384, |
| "grad_norm": 1.1921520233154297, |
| "learning_rate": 4.967832700843041e-05, |
| "loss": 0.5728, |
| "num_input_tokens_seen": 5344896, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.1551355206847361, |
| "grad_norm": 1.161622166633606, |
| "learning_rate": 4.967081930377515e-05, |
| "loss": 0.5036, |
| "num_input_tokens_seen": 5400960, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.15691868758915833, |
| "grad_norm": 1.0513135194778442, |
| "learning_rate": 4.966322557412168e-05, |
| "loss": 0.4347, |
| "num_input_tokens_seen": 5462928, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.1587018544935806, |
| "grad_norm": 1.2251578569412231, |
| "learning_rate": 4.965554584594868e-05, |
| "loss": 0.4997, |
| "num_input_tokens_seen": 5525296, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.16048502139800286, |
| "grad_norm": 1.2554380893707275, |
| "learning_rate": 4.9647780146034695e-05, |
| "loss": 0.511, |
| "num_input_tokens_seen": 5590640, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.16226818830242512, |
| "grad_norm": 2.3998403549194336, |
| "learning_rate": 4.9639928501458035e-05, |
| "loss": 0.5376, |
| "num_input_tokens_seen": 5652912, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.16405135520684735, |
| "grad_norm": 1.3643852472305298, |
| "learning_rate": 4.963199093959671e-05, |
| "loss": 0.5668, |
| "num_input_tokens_seen": 5711952, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.16583452211126962, |
| "grad_norm": 1.4717122316360474, |
| "learning_rate": 4.96239674881283e-05, |
| "loss": 0.4877, |
| "num_input_tokens_seen": 5773968, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.16761768901569188, |
| "grad_norm": 1.8179185390472412, |
| "learning_rate": 4.9615858175029884e-05, |
| "loss": 0.4669, |
| "num_input_tokens_seen": 5836064, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.1694008559201141, |
| "grad_norm": 2.963438034057617, |
| "learning_rate": 4.960766302857793e-05, |
| "loss": 0.4766, |
| "num_input_tokens_seen": 5897600, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.17118402282453637, |
| "grad_norm": 2.9000422954559326, |
| "learning_rate": 4.9599382077348205e-05, |
| "loss": 0.542, |
| "num_input_tokens_seen": 5959856, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.17296718972895864, |
| "grad_norm": 1.1453759670257568, |
| "learning_rate": 4.959101535021566e-05, |
| "loss": 0.5482, |
| "num_input_tokens_seen": 6016128, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.17475035663338087, |
| "grad_norm": 1.1614904403686523, |
| "learning_rate": 4.9582562876354346e-05, |
| "loss": 0.5361, |
| "num_input_tokens_seen": 6079664, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.17653352353780313, |
| "grad_norm": 1.3136591911315918, |
| "learning_rate": 4.95740246852373e-05, |
| "loss": 0.5131, |
| "num_input_tokens_seen": 6137568, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1783166904422254, |
| "grad_norm": 1.0961729288101196, |
| "learning_rate": 4.9565400806636447e-05, |
| "loss": 0.431, |
| "num_input_tokens_seen": 6199280, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18009985734664766, |
| "grad_norm": 1.3530110120773315, |
| "learning_rate": 4.9556691270622515e-05, |
| "loss": 0.526, |
| "num_input_tokens_seen": 6262272, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.1818830242510699, |
| "grad_norm": 1.2133769989013672, |
| "learning_rate": 4.9547896107564886e-05, |
| "loss": 0.5082, |
| "num_input_tokens_seen": 6324144, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.18366619115549215, |
| "grad_norm": 1.2528913021087646, |
| "learning_rate": 4.9539015348131526e-05, |
| "loss": 0.5343, |
| "num_input_tokens_seen": 6386096, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.18544935805991442, |
| "grad_norm": 1.4908058643341064, |
| "learning_rate": 4.953004902328887e-05, |
| "loss": 0.5408, |
| "num_input_tokens_seen": 6450704, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.18723252496433665, |
| "grad_norm": 1.0931016206741333, |
| "learning_rate": 4.9520997164301726e-05, |
| "loss": 0.53, |
| "num_input_tokens_seen": 6512512, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.1890156918687589, |
| "grad_norm": 1.317772626876831, |
| "learning_rate": 4.951185980273312e-05, |
| "loss": 0.4741, |
| "num_input_tokens_seen": 6572848, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.19079885877318117, |
| "grad_norm": 1.114240288734436, |
| "learning_rate": 4.9502636970444246e-05, |
| "loss": 0.5021, |
| "num_input_tokens_seen": 6634064, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.19258202567760344, |
| "grad_norm": 1.1686744689941406, |
| "learning_rate": 4.949332869959432e-05, |
| "loss": 0.5557, |
| "num_input_tokens_seen": 6698560, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.19436519258202567, |
| "grad_norm": 1.2107973098754883, |
| "learning_rate": 4.948393502264046e-05, |
| "loss": 0.5101, |
| "num_input_tokens_seen": 6758000, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.19614835948644793, |
| "grad_norm": 1.067867398262024, |
| "learning_rate": 4.9474455972337607e-05, |
| "loss": 0.4712, |
| "num_input_tokens_seen": 6823616, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1979315263908702, |
| "grad_norm": 1.0068106651306152, |
| "learning_rate": 4.946489158173838e-05, |
| "loss": 0.4854, |
| "num_input_tokens_seen": 6883376, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.19971469329529243, |
| "grad_norm": 1.490473747253418, |
| "learning_rate": 4.945524188419298e-05, |
| "loss": 0.5664, |
| "num_input_tokens_seen": 6943808, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.2014978601997147, |
| "grad_norm": 1.0813665390014648, |
| "learning_rate": 4.9445506913349063e-05, |
| "loss": 0.6241, |
| "num_input_tokens_seen": 7005728, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.20328102710413695, |
| "grad_norm": 1.3641761541366577, |
| "learning_rate": 4.943568670315162e-05, |
| "loss": 0.4916, |
| "num_input_tokens_seen": 7068608, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.20506419400855921, |
| "grad_norm": 1.0902137756347656, |
| "learning_rate": 4.942578128784287e-05, |
| "loss": 0.4833, |
| "num_input_tokens_seen": 7127008, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.20684736091298145, |
| "grad_norm": 1.430445909500122, |
| "learning_rate": 4.941579070196214e-05, |
| "loss": 0.422, |
| "num_input_tokens_seen": 7191776, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.2086305278174037, |
| "grad_norm": 1.6088680028915405, |
| "learning_rate": 4.940571498034572e-05, |
| "loss": 0.4913, |
| "num_input_tokens_seen": 7251536, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.21041369472182597, |
| "grad_norm": 1.3081697225570679, |
| "learning_rate": 4.939555415812678e-05, |
| "loss": 0.451, |
| "num_input_tokens_seen": 7315696, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.2121968616262482, |
| "grad_norm": 1.3625929355621338, |
| "learning_rate": 4.938530827073522e-05, |
| "loss": 0.5694, |
| "num_input_tokens_seen": 7373792, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.21398002853067047, |
| "grad_norm": 1.1833407878875732, |
| "learning_rate": 4.9374977353897566e-05, |
| "loss": 0.5647, |
| "num_input_tokens_seen": 7434464, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.21576319543509273, |
| "grad_norm": 1.3193016052246094, |
| "learning_rate": 4.936456144363681e-05, |
| "loss": 0.5739, |
| "num_input_tokens_seen": 7497328, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.21754636233951496, |
| "grad_norm": 1.4671732187271118, |
| "learning_rate": 4.935406057627234e-05, |
| "loss": 0.5399, |
| "num_input_tokens_seen": 7560816, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.21932952924393723, |
| "grad_norm": 1.0455771684646606, |
| "learning_rate": 4.9343474788419767e-05, |
| "loss": 0.4423, |
| "num_input_tokens_seen": 7623280, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.2211126961483595, |
| "grad_norm": 1.2360905408859253, |
| "learning_rate": 4.9332804116990795e-05, |
| "loss": 0.4595, |
| "num_input_tokens_seen": 7685264, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.22289586305278175, |
| "grad_norm": 1.3082692623138428, |
| "learning_rate": 4.9322048599193124e-05, |
| "loss": 0.5022, |
| "num_input_tokens_seen": 7748000, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.22467902995720399, |
| "grad_norm": 1.306279182434082, |
| "learning_rate": 4.931120827253033e-05, |
| "loss": 0.4287, |
| "num_input_tokens_seen": 7812992, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.22646219686162625, |
| "grad_norm": 1.3158313035964966, |
| "learning_rate": 4.930028317480167e-05, |
| "loss": 0.4895, |
| "num_input_tokens_seen": 7876416, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.2282453637660485, |
| "grad_norm": 1.1636604070663452, |
| "learning_rate": 4.9289273344102014e-05, |
| "loss": 0.4975, |
| "num_input_tokens_seen": 7940544, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.23002853067047074, |
| "grad_norm": 1.23000168800354, |
| "learning_rate": 4.927817881882169e-05, |
| "loss": 0.4295, |
| "num_input_tokens_seen": 7999472, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.231811697574893, |
| "grad_norm": 1.54082453250885, |
| "learning_rate": 4.9266999637646326e-05, |
| "loss": 0.5753, |
| "num_input_tokens_seen": 8061168, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.23359486447931527, |
| "grad_norm": 2.485759973526001, |
| "learning_rate": 4.925573583955676e-05, |
| "loss": 0.443, |
| "num_input_tokens_seen": 8118944, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.23537803138373753, |
| "grad_norm": 1.284912347793579, |
| "learning_rate": 4.9244387463828876e-05, |
| "loss": 0.5421, |
| "num_input_tokens_seen": 8185072, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.23716119828815976, |
| "grad_norm": 3.4624996185302734, |
| "learning_rate": 4.9232954550033484e-05, |
| "loss": 0.4099, |
| "num_input_tokens_seen": 8247616, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.23894436519258203, |
| "grad_norm": 1.1022762060165405, |
| "learning_rate": 4.922143713803613e-05, |
| "loss": 0.4784, |
| "num_input_tokens_seen": 8312240, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.2407275320970043, |
| "grad_norm": 1.1634345054626465, |
| "learning_rate": 4.920983526799705e-05, |
| "loss": 0.3882, |
| "num_input_tokens_seen": 8371088, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.24251069900142652, |
| "grad_norm": 1.4921728372573853, |
| "learning_rate": 4.919814898037095e-05, |
| "loss": 0.5662, |
| "num_input_tokens_seen": 8435264, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.24429386590584878, |
| "grad_norm": 1.2474942207336426, |
| "learning_rate": 4.918637831590689e-05, |
| "loss": 0.4169, |
| "num_input_tokens_seen": 8498960, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.24607703281027105, |
| "grad_norm": 0.9692139625549316, |
| "learning_rate": 4.917452331564816e-05, |
| "loss": 0.4681, |
| "num_input_tokens_seen": 8561168, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2478601997146933, |
| "grad_norm": 1.57968008518219, |
| "learning_rate": 4.9162584020932114e-05, |
| "loss": 0.4668, |
| "num_input_tokens_seen": 8624528, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.24964336661911554, |
| "grad_norm": 1.7983195781707764, |
| "learning_rate": 4.915056047339002e-05, |
| "loss": 0.5366, |
| "num_input_tokens_seen": 8684608, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2514265335235378, |
| "grad_norm": 1.3157538175582886, |
| "learning_rate": 4.913845271494695e-05, |
| "loss": 0.4451, |
| "num_input_tokens_seen": 8747216, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.25320970042796004, |
| "grad_norm": 1.193604588508606, |
| "learning_rate": 4.91262607878216e-05, |
| "loss": 0.5626, |
| "num_input_tokens_seen": 8807392, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2549928673323823, |
| "grad_norm": 1.0445785522460938, |
| "learning_rate": 4.911398473452616e-05, |
| "loss": 0.4848, |
| "num_input_tokens_seen": 8868496, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.25677603423680456, |
| "grad_norm": 1.8069995641708374, |
| "learning_rate": 4.910162459786617e-05, |
| "loss": 0.4672, |
| "num_input_tokens_seen": 8929056, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2585592011412268, |
| "grad_norm": 1.1339744329452515, |
| "learning_rate": 4.908918042094033e-05, |
| "loss": 0.399, |
| "num_input_tokens_seen": 8991968, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.2603423680456491, |
| "grad_norm": 1.2230961322784424, |
| "learning_rate": 4.907665224714042e-05, |
| "loss": 0.5477, |
| "num_input_tokens_seen": 9053408, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.26212553495007135, |
| "grad_norm": 1.2331055402755737, |
| "learning_rate": 4.906404012015108e-05, |
| "loss": 0.4485, |
| "num_input_tokens_seen": 9115920, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.26390870185449355, |
| "grad_norm": 1.8696657419204712, |
| "learning_rate": 4.905134408394969e-05, |
| "loss": 0.4714, |
| "num_input_tokens_seen": 9184576, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2656918687589158, |
| "grad_norm": 1.9693909883499146, |
| "learning_rate": 4.9038564182806234e-05, |
| "loss": 0.516, |
| "num_input_tokens_seen": 9247872, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.2674750356633381, |
| "grad_norm": 1.0184056758880615, |
| "learning_rate": 4.902570046128312e-05, |
| "loss": 0.4914, |
| "num_input_tokens_seen": 9310976, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.26925820256776034, |
| "grad_norm": 1.165300726890564, |
| "learning_rate": 4.9012752964235014e-05, |
| "loss": 0.4695, |
| "num_input_tokens_seen": 9372016, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.2710413694721826, |
| "grad_norm": 1.0303696393966675, |
| "learning_rate": 4.8999721736808714e-05, |
| "loss": 0.4741, |
| "num_input_tokens_seen": 9432624, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.27282453637660486, |
| "grad_norm": 1.2935962677001953, |
| "learning_rate": 4.898660682444297e-05, |
| "loss": 0.5044, |
| "num_input_tokens_seen": 9493360, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.2746077032810271, |
| "grad_norm": 1.3259665966033936, |
| "learning_rate": 4.8973408272868347e-05, |
| "loss": 0.4618, |
| "num_input_tokens_seen": 9555136, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.27639087018544933, |
| "grad_norm": 4.303719520568848, |
| "learning_rate": 4.896012612810704e-05, |
| "loss": 0.3954, |
| "num_input_tokens_seen": 9616896, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.2781740370898716, |
| "grad_norm": 1.2892228364944458, |
| "learning_rate": 4.894676043647274e-05, |
| "loss": 0.3872, |
| "num_input_tokens_seen": 9674752, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.27995720399429386, |
| "grad_norm": 1.360479474067688, |
| "learning_rate": 4.8933311244570434e-05, |
| "loss": 0.4713, |
| "num_input_tokens_seen": 9736976, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.2817403708987161, |
| "grad_norm": 1.3140631914138794, |
| "learning_rate": 4.8919778599296293e-05, |
| "loss": 0.3917, |
| "num_input_tokens_seen": 9797136, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.2835235378031384, |
| "grad_norm": 1.0723479986190796, |
| "learning_rate": 4.890616254783748e-05, |
| "loss": 0.4911, |
| "num_input_tokens_seen": 9858928, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.28530670470756064, |
| "grad_norm": 1.4321138858795166, |
| "learning_rate": 4.8892463137671963e-05, |
| "loss": 0.4682, |
| "num_input_tokens_seen": 9917776, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2870898716119829, |
| "grad_norm": 1.2900583744049072, |
| "learning_rate": 4.887868041656839e-05, |
| "loss": 0.4978, |
| "num_input_tokens_seen": 9982464, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.2888730385164051, |
| "grad_norm": 1.1396691799163818, |
| "learning_rate": 4.886481443258594e-05, |
| "loss": 0.4178, |
| "num_input_tokens_seen": 10044208, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2906562054208274, |
| "grad_norm": 1.40047025680542, |
| "learning_rate": 4.885086523407405e-05, |
| "loss": 0.455, |
| "num_input_tokens_seen": 10105968, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.29243937232524964, |
| "grad_norm": 1.263271689414978, |
| "learning_rate": 4.88368328696724e-05, |
| "loss": 0.4933, |
| "num_input_tokens_seen": 10166992, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2942225392296719, |
| "grad_norm": 1.3891979455947876, |
| "learning_rate": 4.882271738831059e-05, |
| "loss": 0.5043, |
| "num_input_tokens_seen": 10232144, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.29600570613409416, |
| "grad_norm": 1.4529082775115967, |
| "learning_rate": 4.880851883920809e-05, |
| "loss": 0.5188, |
| "num_input_tokens_seen": 10292944, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2977888730385164, |
| "grad_norm": 1.1358407735824585, |
| "learning_rate": 4.879423727187401e-05, |
| "loss": 0.5159, |
| "num_input_tokens_seen": 10354256, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.2995720399429387, |
| "grad_norm": 1.7224394083023071, |
| "learning_rate": 4.8779872736106916e-05, |
| "loss": 0.5063, |
| "num_input_tokens_seen": 10416688, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.3013552068473609, |
| "grad_norm": 1.557830810546875, |
| "learning_rate": 4.8765425281994704e-05, |
| "loss": 0.44, |
| "num_input_tokens_seen": 10477712, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.30313837375178315, |
| "grad_norm": 1.207537293434143, |
| "learning_rate": 4.8750894959914377e-05, |
| "loss": 0.457, |
| "num_input_tokens_seen": 10539120, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.3049215406562054, |
| "grad_norm": 1.2876931428909302, |
| "learning_rate": 4.873628182053191e-05, |
| "loss": 0.4583, |
| "num_input_tokens_seen": 10602400, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.3067047075606277, |
| "grad_norm": 1.5556424856185913, |
| "learning_rate": 4.872158591480206e-05, |
| "loss": 0.4462, |
| "num_input_tokens_seen": 10665920, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.30848787446504994, |
| "grad_norm": 1.2405084371566772, |
| "learning_rate": 4.870680729396815e-05, |
| "loss": 0.4229, |
| "num_input_tokens_seen": 10732768, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.3102710413694722, |
| "grad_norm": 1.3671534061431885, |
| "learning_rate": 4.869194600956195e-05, |
| "loss": 0.5017, |
| "num_input_tokens_seen": 10794368, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.31205420827389446, |
| "grad_norm": 1.0638670921325684, |
| "learning_rate": 4.867700211340347e-05, |
| "loss": 0.4751, |
| "num_input_tokens_seen": 10853408, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.31383737517831667, |
| "grad_norm": 1.2563133239746094, |
| "learning_rate": 4.8661975657600765e-05, |
| "loss": 0.4873, |
| "num_input_tokens_seen": 10918576, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.31562054208273893, |
| "grad_norm": 1.0638364553451538, |
| "learning_rate": 4.8646866694549795e-05, |
| "loss": 0.4572, |
| "num_input_tokens_seen": 10980976, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.3174037089871612, |
| "grad_norm": 1.3460172414779663, |
| "learning_rate": 4.863167527693417e-05, |
| "loss": 0.4758, |
| "num_input_tokens_seen": 11040448, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.31918687589158345, |
| "grad_norm": 1.210242509841919, |
| "learning_rate": 4.861640145772507e-05, |
| "loss": 0.5092, |
| "num_input_tokens_seen": 11104160, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.3209700427960057, |
| "grad_norm": 1.0002316236495972, |
| "learning_rate": 4.8601045290180946e-05, |
| "loss": 0.4447, |
| "num_input_tokens_seen": 11164224, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.322753209700428, |
| "grad_norm": 1.332479476928711, |
| "learning_rate": 4.858560682784744e-05, |
| "loss": 0.4335, |
| "num_input_tokens_seen": 11227376, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.32453637660485024, |
| "grad_norm": 1.223310112953186, |
| "learning_rate": 4.8570086124557116e-05, |
| "loss": 0.4156, |
| "num_input_tokens_seen": 11284704, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.32631954350927245, |
| "grad_norm": 1.439526915550232, |
| "learning_rate": 4.85544832344293e-05, |
| "loss": 0.431, |
| "num_input_tokens_seen": 11348448, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.3281027104136947, |
| "grad_norm": 1.27132248878479, |
| "learning_rate": 4.853879821186993e-05, |
| "loss": 0.4941, |
| "num_input_tokens_seen": 11406160, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.32988587731811697, |
| "grad_norm": 1.6706770658493042, |
| "learning_rate": 4.8523031111571316e-05, |
| "loss": 0.4718, |
| "num_input_tokens_seen": 11467088, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.33166904422253923, |
| "grad_norm": 1.131922960281372, |
| "learning_rate": 4.850718198851195e-05, |
| "loss": 0.4172, |
| "num_input_tokens_seen": 11532768, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3334522111269615, |
| "grad_norm": 1.1946320533752441, |
| "learning_rate": 4.849125089795634e-05, |
| "loss": 0.3736, |
| "num_input_tokens_seen": 11591984, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.33523537803138376, |
| "grad_norm": 1.2938627004623413, |
| "learning_rate": 4.8475237895454833e-05, |
| "loss": 0.462, |
| "num_input_tokens_seen": 11656624, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.33701854493580596, |
| "grad_norm": 1.3345882892608643, |
| "learning_rate": 4.845914303684336e-05, |
| "loss": 0.4584, |
| "num_input_tokens_seen": 11718256, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.3388017118402282, |
| "grad_norm": 0.8923389315605164, |
| "learning_rate": 4.844296637824329e-05, |
| "loss": 0.5339, |
| "num_input_tokens_seen": 11776080, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.3405848787446505, |
| "grad_norm": 1.2611216306686401, |
| "learning_rate": 4.8426707976061226e-05, |
| "loss": 0.5625, |
| "num_input_tokens_seen": 11840768, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.34236804564907275, |
| "grad_norm": 1.1760461330413818, |
| "learning_rate": 4.84103678869888e-05, |
| "loss": 0.5043, |
| "num_input_tokens_seen": 11904208, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.344151212553495, |
| "grad_norm": 1.19206964969635, |
| "learning_rate": 4.8393946168002477e-05, |
| "loss": 0.4183, |
| "num_input_tokens_seen": 11967952, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.3459343794579173, |
| "grad_norm": 1.1648989915847778, |
| "learning_rate": 4.8377442876363364e-05, |
| "loss": 0.4095, |
| "num_input_tokens_seen": 12033136, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.34771754636233954, |
| "grad_norm": 1.2076241970062256, |
| "learning_rate": 4.8360858069617006e-05, |
| "loss": 0.4537, |
| "num_input_tokens_seen": 12097584, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.34950071326676174, |
| "grad_norm": 1.0648747682571411, |
| "learning_rate": 4.834419180559317e-05, |
| "loss": 0.3932, |
| "num_input_tokens_seen": 12156320, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.351283880171184, |
| "grad_norm": 1.228440523147583, |
| "learning_rate": 4.832744414240567e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 12218384, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.35306704707560627, |
| "grad_norm": 1.4440958499908447, |
| "learning_rate": 4.8310615138452156e-05, |
| "loss": 0.4685, |
| "num_input_tokens_seen": 12281856, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.35485021398002853, |
| "grad_norm": 1.0754982233047485, |
| "learning_rate": 4.829370485241388e-05, |
| "loss": 0.4623, |
| "num_input_tokens_seen": 12343904, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.3566333808844508, |
| "grad_norm": 1.3745994567871094, |
| "learning_rate": 4.827671334325556e-05, |
| "loss": 0.4334, |
| "num_input_tokens_seen": 12402256, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.35841654778887305, |
| "grad_norm": 1.0649508237838745, |
| "learning_rate": 4.82596406702251e-05, |
| "loss": 0.4728, |
| "num_input_tokens_seen": 12465536, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.3601997146932953, |
| "grad_norm": 1.061046838760376, |
| "learning_rate": 4.8242486892853424e-05, |
| "loss": 0.421, |
| "num_input_tokens_seen": 12530464, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.3619828815977175, |
| "grad_norm": 1.7068672180175781, |
| "learning_rate": 4.822525207095425e-05, |
| "loss": 0.4843, |
| "num_input_tokens_seen": 12593216, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.3637660485021398, |
| "grad_norm": 1.2018966674804688, |
| "learning_rate": 4.820793626462391e-05, |
| "loss": 0.4604, |
| "num_input_tokens_seen": 12655248, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.36554921540656204, |
| "grad_norm": 1.2888667583465576, |
| "learning_rate": 4.819053953424112e-05, |
| "loss": 0.427, |
| "num_input_tokens_seen": 12718048, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.3673323823109843, |
| "grad_norm": 1.3183050155639648, |
| "learning_rate": 4.817306194046675e-05, |
| "loss": 0.4415, |
| "num_input_tokens_seen": 12781536, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.36911554921540657, |
| "grad_norm": 1.7154110670089722, |
| "learning_rate": 4.815550354424365e-05, |
| "loss": 0.5193, |
| "num_input_tokens_seen": 12844336, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.37089871611982883, |
| "grad_norm": 1.3131228685379028, |
| "learning_rate": 4.813786440679642e-05, |
| "loss": 0.4078, |
| "num_input_tokens_seen": 12906288, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.3726818830242511, |
| "grad_norm": 1.1588881015777588, |
| "learning_rate": 4.81201445896312e-05, |
| "loss": 0.3672, |
| "num_input_tokens_seen": 12965200, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.3744650499286733, |
| "grad_norm": 1.5113669633865356, |
| "learning_rate": 4.810234415453545e-05, |
| "loss": 0.4896, |
| "num_input_tokens_seen": 13033248, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.37624821683309556, |
| "grad_norm": 1.4646553993225098, |
| "learning_rate": 4.808446316357773e-05, |
| "loss": 0.4772, |
| "num_input_tokens_seen": 13096752, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.3780313837375178, |
| "grad_norm": 1.9652795791625977, |
| "learning_rate": 4.80665016791075e-05, |
| "loss": 0.4468, |
| "num_input_tokens_seen": 13158992, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3798145506419401, |
| "grad_norm": 3.033592700958252, |
| "learning_rate": 4.804845976375489e-05, |
| "loss": 0.3997, |
| "num_input_tokens_seen": 13222064, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.38159771754636235, |
| "grad_norm": 1.2086786031723022, |
| "learning_rate": 4.8030337480430496e-05, |
| "loss": 0.4966, |
| "num_input_tokens_seen": 13286112, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.3833808844507846, |
| "grad_norm": 1.7219619750976562, |
| "learning_rate": 4.801213489232514e-05, |
| "loss": 0.4918, |
| "num_input_tokens_seen": 13346832, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.38516405135520687, |
| "grad_norm": 1.256044864654541, |
| "learning_rate": 4.799385206290965e-05, |
| "loss": 0.4734, |
| "num_input_tokens_seen": 13408992, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.3869472182596291, |
| "grad_norm": 1.150932788848877, |
| "learning_rate": 4.7975489055934666e-05, |
| "loss": 0.3703, |
| "num_input_tokens_seen": 13469280, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.38873038516405134, |
| "grad_norm": 1.4256497621536255, |
| "learning_rate": 4.79570459354304e-05, |
| "loss": 0.5076, |
| "num_input_tokens_seen": 13533536, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.3905135520684736, |
| "grad_norm": 1.1593137979507446, |
| "learning_rate": 4.79385227657064e-05, |
| "loss": 0.4351, |
| "num_input_tokens_seen": 13594304, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.39229671897289586, |
| "grad_norm": 0.9239110350608826, |
| "learning_rate": 4.791991961135135e-05, |
| "loss": 0.4984, |
| "num_input_tokens_seen": 13657328, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3940798858773181, |
| "grad_norm": 0.999727189540863, |
| "learning_rate": 4.790123653723282e-05, |
| "loss": 0.4598, |
| "num_input_tokens_seen": 13720224, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.3958630527817404, |
| "grad_norm": 1.0658410787582397, |
| "learning_rate": 4.788247360849708e-05, |
| "loss": 0.4409, |
| "num_input_tokens_seen": 13782656, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.39764621968616265, |
| "grad_norm": 1.2038542032241821, |
| "learning_rate": 4.786363089056881e-05, |
| "loss": 0.4719, |
| "num_input_tokens_seen": 13849120, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.39942938659058486, |
| "grad_norm": 1.1782008409500122, |
| "learning_rate": 4.784470844915093e-05, |
| "loss": 0.4147, |
| "num_input_tokens_seen": 13910944, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.4012125534950071, |
| "grad_norm": 0.9827120304107666, |
| "learning_rate": 4.782570635022436e-05, |
| "loss": 0.3883, |
| "num_input_tokens_seen": 13969248, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.4029957203994294, |
| "grad_norm": 1.0276070833206177, |
| "learning_rate": 4.7806624660047744e-05, |
| "loss": 0.4337, |
| "num_input_tokens_seen": 14028112, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.40477888730385164, |
| "grad_norm": 1.923315167427063, |
| "learning_rate": 4.7787463445157286e-05, |
| "loss": 0.5135, |
| "num_input_tokens_seen": 14090320, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.4065620542082739, |
| "grad_norm": 1.3430618047714233, |
| "learning_rate": 4.7768222772366466e-05, |
| "loss": 0.5111, |
| "num_input_tokens_seen": 14151840, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.40834522111269617, |
| "grad_norm": 1.5225883722305298, |
| "learning_rate": 4.774890270876584e-05, |
| "loss": 0.5005, |
| "num_input_tokens_seen": 14213824, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.41012838801711843, |
| "grad_norm": 1.0013866424560547, |
| "learning_rate": 4.772950332172279e-05, |
| "loss": 0.6018, |
| "num_input_tokens_seen": 14278736, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.41191155492154063, |
| "grad_norm": 1.0078413486480713, |
| "learning_rate": 4.771002467888128e-05, |
| "loss": 0.3879, |
| "num_input_tokens_seen": 14339408, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.4136947218259629, |
| "grad_norm": 1.1650017499923706, |
| "learning_rate": 4.769046684816165e-05, |
| "loss": 0.4924, |
| "num_input_tokens_seen": 14399008, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.41547788873038516, |
| "grad_norm": 1.351217269897461, |
| "learning_rate": 4.767082989776034e-05, |
| "loss": 0.4104, |
| "num_input_tokens_seen": 14462656, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.4172610556348074, |
| "grad_norm": 1.3392795324325562, |
| "learning_rate": 4.76511138961497e-05, |
| "loss": 0.4629, |
| "num_input_tokens_seen": 14527568, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.4190442225392297, |
| "grad_norm": 1.3544095754623413, |
| "learning_rate": 4.763131891207771e-05, |
| "loss": 0.486, |
| "num_input_tokens_seen": 14590944, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.42082738944365194, |
| "grad_norm": 1.1842771768569946, |
| "learning_rate": 4.761144501456773e-05, |
| "loss": 0.4529, |
| "num_input_tokens_seen": 14651104, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.4226105563480742, |
| "grad_norm": 0.9588406085968018, |
| "learning_rate": 4.7591492272918344e-05, |
| "loss": 0.3739, |
| "num_input_tokens_seen": 14711344, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.4243937232524964, |
| "grad_norm": 1.1637108325958252, |
| "learning_rate": 4.7571460756703e-05, |
| "loss": 0.4772, |
| "num_input_tokens_seen": 14772656, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.4261768901569187, |
| "grad_norm": 1.225539207458496, |
| "learning_rate": 4.755135053576987e-05, |
| "loss": 0.4606, |
| "num_input_tokens_seen": 14833840, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.42796005706134094, |
| "grad_norm": 1.4021525382995605, |
| "learning_rate": 4.753116168024153e-05, |
| "loss": 0.4168, |
| "num_input_tokens_seen": 14896544, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4297432239657632, |
| "grad_norm": 1.615051507949829, |
| "learning_rate": 4.751089426051476e-05, |
| "loss": 0.4156, |
| "num_input_tokens_seen": 14956432, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.43152639087018546, |
| "grad_norm": 3.979645252227783, |
| "learning_rate": 4.749054834726029e-05, |
| "loss": 0.5188, |
| "num_input_tokens_seen": 15021296, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.4333095577746077, |
| "grad_norm": 1.3276335000991821, |
| "learning_rate": 4.7470124011422555e-05, |
| "loss": 0.4941, |
| "num_input_tokens_seen": 15080688, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.43509272467902993, |
| "grad_norm": 1.2513278722763062, |
| "learning_rate": 4.744962132421943e-05, |
| "loss": 0.4719, |
| "num_input_tokens_seen": 15141456, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.4368758915834522, |
| "grad_norm": 1.1449891328811646, |
| "learning_rate": 4.742904035714199e-05, |
| "loss": 0.4811, |
| "num_input_tokens_seen": 15202768, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.43865905848787445, |
| "grad_norm": 1.0668220520019531, |
| "learning_rate": 4.7408381181954284e-05, |
| "loss": 0.4801, |
| "num_input_tokens_seen": 15266416, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.4404422253922967, |
| "grad_norm": 1.576777458190918, |
| "learning_rate": 4.7387643870693055e-05, |
| "loss": 0.4551, |
| "num_input_tokens_seen": 15328416, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.442225392296719, |
| "grad_norm": 1.0677021741867065, |
| "learning_rate": 4.736682849566751e-05, |
| "loss": 0.3682, |
| "num_input_tokens_seen": 15387392, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.44400855920114124, |
| "grad_norm": 1.105083703994751, |
| "learning_rate": 4.734593512945904e-05, |
| "loss": 0.4721, |
| "num_input_tokens_seen": 15444928, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.4457917261055635, |
| "grad_norm": 1.1016100645065308, |
| "learning_rate": 4.7324963844920986e-05, |
| "loss": 0.4568, |
| "num_input_tokens_seen": 15505488, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4475748930099857, |
| "grad_norm": 1.4010059833526611, |
| "learning_rate": 4.7303914715178396e-05, |
| "loss": 0.5337, |
| "num_input_tokens_seen": 15566336, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.44935805991440797, |
| "grad_norm": 1.149238109588623, |
| "learning_rate": 4.728278781362777e-05, |
| "loss": 0.3965, |
| "num_input_tokens_seen": 15632768, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.45114122681883023, |
| "grad_norm": 1.4296883344650269, |
| "learning_rate": 4.7261583213936746e-05, |
| "loss": 0.5366, |
| "num_input_tokens_seen": 15694944, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.4529243937232525, |
| "grad_norm": 1.2786849737167358, |
| "learning_rate": 4.7240300990043926e-05, |
| "loss": 0.4339, |
| "num_input_tokens_seen": 15756496, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.45470756062767476, |
| "grad_norm": 1.1299382448196411, |
| "learning_rate": 4.721894121615859e-05, |
| "loss": 0.4866, |
| "num_input_tokens_seen": 15821200, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.456490727532097, |
| "grad_norm": 1.1465532779693604, |
| "learning_rate": 4.7197503966760375e-05, |
| "loss": 0.4288, |
| "num_input_tokens_seen": 15882736, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4582738944365193, |
| "grad_norm": 1.4677292108535767, |
| "learning_rate": 4.717598931659913e-05, |
| "loss": 0.443, |
| "num_input_tokens_seen": 15944560, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.4600570613409415, |
| "grad_norm": 1.8437912464141846, |
| "learning_rate": 4.7154397340694556e-05, |
| "loss": 0.4923, |
| "num_input_tokens_seen": 16006784, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.46184022824536375, |
| "grad_norm": 1.5408210754394531, |
| "learning_rate": 4.713272811433599e-05, |
| "loss": 0.4868, |
| "num_input_tokens_seen": 16068896, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.463623395149786, |
| "grad_norm": 1.1977325677871704, |
| "learning_rate": 4.711098171308214e-05, |
| "loss": 0.4781, |
| "num_input_tokens_seen": 16128640, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.4654065620542083, |
| "grad_norm": 1.470975399017334, |
| "learning_rate": 4.708915821276082e-05, |
| "loss": 0.4748, |
| "num_input_tokens_seen": 16192800, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.46718972895863053, |
| "grad_norm": 1.460138201713562, |
| "learning_rate": 4.706725768946866e-05, |
| "loss": 0.5107, |
| "num_input_tokens_seen": 16251248, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.4689728958630528, |
| "grad_norm": 1.2103915214538574, |
| "learning_rate": 4.7045280219570896e-05, |
| "loss": 0.4768, |
| "num_input_tokens_seen": 16314704, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.47075606276747506, |
| "grad_norm": 1.1669901609420776, |
| "learning_rate": 4.702322587970104e-05, |
| "loss": 0.4624, |
| "num_input_tokens_seen": 16375792, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.47253922967189727, |
| "grad_norm": 1.1790727376937866, |
| "learning_rate": 4.700109474676064e-05, |
| "loss": 0.4735, |
| "num_input_tokens_seen": 16438672, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.4743223965763195, |
| "grad_norm": 1.019875407218933, |
| "learning_rate": 4.697888689791906e-05, |
| "loss": 0.3809, |
| "num_input_tokens_seen": 16498896, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4761055634807418, |
| "grad_norm": 1.2999383211135864, |
| "learning_rate": 4.6956602410613115e-05, |
| "loss": 0.4421, |
| "num_input_tokens_seen": 16566736, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.47788873038516405, |
| "grad_norm": 1.4289456605911255, |
| "learning_rate": 4.6934241362546874e-05, |
| "loss": 0.5083, |
| "num_input_tokens_seen": 16630480, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.4796718972895863, |
| "grad_norm": 1.2647002935409546, |
| "learning_rate": 4.691180383169137e-05, |
| "loss": 0.5118, |
| "num_input_tokens_seen": 16688832, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.4814550641940086, |
| "grad_norm": 1.1783503293991089, |
| "learning_rate": 4.688928989628431e-05, |
| "loss": 0.4128, |
| "num_input_tokens_seen": 16752432, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.48323823109843084, |
| "grad_norm": 1.2250187397003174, |
| "learning_rate": 4.686669963482983e-05, |
| "loss": 0.3974, |
| "num_input_tokens_seen": 16814912, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.48502139800285304, |
| "grad_norm": 1.5874429941177368, |
| "learning_rate": 4.6844033126098206e-05, |
| "loss": 0.5244, |
| "num_input_tokens_seen": 16875696, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.4868045649072753, |
| "grad_norm": 1.5424435138702393, |
| "learning_rate": 4.682129044912558e-05, |
| "loss": 0.3909, |
| "num_input_tokens_seen": 16934768, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.48858773181169757, |
| "grad_norm": 1.395538568496704, |
| "learning_rate": 4.679847168321368e-05, |
| "loss": 0.4208, |
| "num_input_tokens_seen": 16994192, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.49037089871611983, |
| "grad_norm": 1.3311400413513184, |
| "learning_rate": 4.677557690792956e-05, |
| "loss": 0.5148, |
| "num_input_tokens_seen": 17055952, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.4921540656205421, |
| "grad_norm": 1.0483784675598145, |
| "learning_rate": 4.6752606203105314e-05, |
| "loss": 0.4838, |
| "num_input_tokens_seen": 17118352, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.49393723252496435, |
| "grad_norm": 1.4240469932556152, |
| "learning_rate": 4.6729559648837777e-05, |
| "loss": 0.4676, |
| "num_input_tokens_seen": 17181856, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.4957203994293866, |
| "grad_norm": 1.1497527360916138, |
| "learning_rate": 4.6706437325488285e-05, |
| "loss": 0.4607, |
| "num_input_tokens_seen": 17239040, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.4975035663338088, |
| "grad_norm": 1.324589490890503, |
| "learning_rate": 4.6683239313682356e-05, |
| "loss": 0.3867, |
| "num_input_tokens_seen": 17300096, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.4992867332382311, |
| "grad_norm": 1.401481032371521, |
| "learning_rate": 4.6659965694309446e-05, |
| "loss": 0.477, |
| "num_input_tokens_seen": 17367088, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5010699001426534, |
| "grad_norm": 1.0556763410568237, |
| "learning_rate": 4.6636616548522637e-05, |
| "loss": 0.4092, |
| "num_input_tokens_seen": 17427648, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.5028530670470756, |
| "grad_norm": 1.5187320709228516, |
| "learning_rate": 4.661319195773837e-05, |
| "loss": 0.4266, |
| "num_input_tokens_seen": 17491664, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.5046362339514978, |
| "grad_norm": 1.2626229524612427, |
| "learning_rate": 4.658969200363614e-05, |
| "loss": 0.5192, |
| "num_input_tokens_seen": 17553312, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.5064194008559201, |
| "grad_norm": 1.3596255779266357, |
| "learning_rate": 4.6566116768158254e-05, |
| "loss": 0.4983, |
| "num_input_tokens_seen": 17614656, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.5082025677603423, |
| "grad_norm": 1.131866455078125, |
| "learning_rate": 4.6542466333509496e-05, |
| "loss": 0.4593, |
| "num_input_tokens_seen": 17673104, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.5099857346647646, |
| "grad_norm": 1.1720597743988037, |
| "learning_rate": 4.651874078215688e-05, |
| "loss": 0.3885, |
| "num_input_tokens_seen": 17733920, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.5117689015691869, |
| "grad_norm": 1.1201550960540771, |
| "learning_rate": 4.6494940196829326e-05, |
| "loss": 0.4661, |
| "num_input_tokens_seen": 17795024, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.5135520684736091, |
| "grad_norm": 1.4359281063079834, |
| "learning_rate": 4.647106466051741e-05, |
| "loss": 0.4327, |
| "num_input_tokens_seen": 17856080, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.5153352353780314, |
| "grad_norm": 1.2126119136810303, |
| "learning_rate": 4.644711425647305e-05, |
| "loss": 0.4281, |
| "num_input_tokens_seen": 17918592, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.5171184022824536, |
| "grad_norm": 1.1998052597045898, |
| "learning_rate": 4.642308906820921e-05, |
| "loss": 0.4234, |
| "num_input_tokens_seen": 17985056, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.5189015691868759, |
| "grad_norm": 1.2513782978057861, |
| "learning_rate": 4.6398989179499635e-05, |
| "loss": 0.4952, |
| "num_input_tokens_seen": 18047856, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.5206847360912982, |
| "grad_norm": 1.5451606512069702, |
| "learning_rate": 4.637481467437854e-05, |
| "loss": 0.4061, |
| "num_input_tokens_seen": 18110608, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5224679029957204, |
| "grad_norm": 1.280383586883545, |
| "learning_rate": 4.635056563714031e-05, |
| "loss": 0.4709, |
| "num_input_tokens_seen": 18170192, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.5242510699001427, |
| "grad_norm": 1.536872386932373, |
| "learning_rate": 4.632624215233924e-05, |
| "loss": 0.5166, |
| "num_input_tokens_seen": 18234512, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.526034236804565, |
| "grad_norm": 1.1344192028045654, |
| "learning_rate": 4.6301844304789185e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 18297872, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.5278174037089871, |
| "grad_norm": 1.2558397054672241, |
| "learning_rate": 4.6277372179563336e-05, |
| "loss": 0.4426, |
| "num_input_tokens_seen": 18360688, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.5296005706134094, |
| "grad_norm": 1.3379613161087036, |
| "learning_rate": 4.625282586199384e-05, |
| "loss": 0.4684, |
| "num_input_tokens_seen": 18421600, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.5313837375178316, |
| "grad_norm": 1.471182942390442, |
| "learning_rate": 4.622820543767159e-05, |
| "loss": 0.3746, |
| "num_input_tokens_seen": 18482608, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.5331669044222539, |
| "grad_norm": 1.147135853767395, |
| "learning_rate": 4.6203510992445844e-05, |
| "loss": 0.3896, |
| "num_input_tokens_seen": 18542720, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.5349500713266762, |
| "grad_norm": 1.6015293598175049, |
| "learning_rate": 4.617874261242399e-05, |
| "loss": 0.4613, |
| "num_input_tokens_seen": 18604304, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5367332382310984, |
| "grad_norm": 1.1261463165283203, |
| "learning_rate": 4.615390038397121e-05, |
| "loss": 0.4636, |
| "num_input_tokens_seen": 18666336, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.5385164051355207, |
| "grad_norm": 1.1836202144622803, |
| "learning_rate": 4.612898439371019e-05, |
| "loss": 0.4072, |
| "num_input_tokens_seen": 18724912, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.5402995720399429, |
| "grad_norm": 1.108585238456726, |
| "learning_rate": 4.6103994728520815e-05, |
| "loss": 0.3483, |
| "num_input_tokens_seen": 18786352, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.5420827389443652, |
| "grad_norm": 1.3794957399368286, |
| "learning_rate": 4.607893147553989e-05, |
| "loss": 0.4259, |
| "num_input_tokens_seen": 18851488, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5438659058487875, |
| "grad_norm": 1.4083433151245117, |
| "learning_rate": 4.605379472216076e-05, |
| "loss": 0.4364, |
| "num_input_tokens_seen": 18915008, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.5456490727532097, |
| "grad_norm": 1.3088963031768799, |
| "learning_rate": 4.602858455603313e-05, |
| "loss": 0.4098, |
| "num_input_tokens_seen": 18976256, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.547432239657632, |
| "grad_norm": 1.3022725582122803, |
| "learning_rate": 4.600330106506263e-05, |
| "loss": 0.4449, |
| "num_input_tokens_seen": 19036560, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.5492154065620543, |
| "grad_norm": 1.7286397218704224, |
| "learning_rate": 4.597794433741061e-05, |
| "loss": 0.5088, |
| "num_input_tokens_seen": 19097568, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5509985734664765, |
| "grad_norm": 1.4286762475967407, |
| "learning_rate": 4.5952514461493754e-05, |
| "loss": 0.445, |
| "num_input_tokens_seen": 19158592, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.5527817403708987, |
| "grad_norm": 1.2713367938995361, |
| "learning_rate": 4.5927011525983824e-05, |
| "loss": 0.3791, |
| "num_input_tokens_seen": 19215600, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5545649072753209, |
| "grad_norm": 1.3422623872756958, |
| "learning_rate": 4.590143561980736e-05, |
| "loss": 0.4897, |
| "num_input_tokens_seen": 19277184, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.5563480741797432, |
| "grad_norm": 1.278333306312561, |
| "learning_rate": 4.5875786832145287e-05, |
| "loss": 0.4426, |
| "num_input_tokens_seen": 19338032, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.5581312410841655, |
| "grad_norm": 1.4938713312149048, |
| "learning_rate": 4.5850065252432706e-05, |
| "loss": 0.4246, |
| "num_input_tokens_seen": 19397040, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.5599144079885877, |
| "grad_norm": 2.4364399909973145, |
| "learning_rate": 4.582427097035854e-05, |
| "loss": 0.4777, |
| "num_input_tokens_seen": 19456144, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.56169757489301, |
| "grad_norm": 3.5539422035217285, |
| "learning_rate": 4.579840407586517e-05, |
| "loss": 0.4894, |
| "num_input_tokens_seen": 19518176, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.5634807417974322, |
| "grad_norm": 1.4036399126052856, |
| "learning_rate": 4.577246465914825e-05, |
| "loss": 0.4704, |
| "num_input_tokens_seen": 19581024, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.5652639087018545, |
| "grad_norm": 0.9552262425422668, |
| "learning_rate": 4.5746452810656225e-05, |
| "loss": 0.4527, |
| "num_input_tokens_seen": 19643104, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.5670470756062768, |
| "grad_norm": 1.2145711183547974, |
| "learning_rate": 4.572036862109017e-05, |
| "loss": 0.4612, |
| "num_input_tokens_seen": 19702528, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.568830242510699, |
| "grad_norm": 1.0046789646148682, |
| "learning_rate": 4.5694212181403374e-05, |
| "loss": 0.4235, |
| "num_input_tokens_seen": 19763424, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.5706134094151213, |
| "grad_norm": 1.3540983200073242, |
| "learning_rate": 4.5667983582801064e-05, |
| "loss": 0.3833, |
| "num_input_tokens_seen": 19823200, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5723965763195435, |
| "grad_norm": 1.2544758319854736, |
| "learning_rate": 4.5641682916740084e-05, |
| "loss": 0.4586, |
| "num_input_tokens_seen": 19883888, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.5741797432239658, |
| "grad_norm": 1.1667801141738892, |
| "learning_rate": 4.5615310274928556e-05, |
| "loss": 0.5969, |
| "num_input_tokens_seen": 19949840, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.5759629101283881, |
| "grad_norm": 0.9844037294387817, |
| "learning_rate": 4.5588865749325594e-05, |
| "loss": 0.3798, |
| "num_input_tokens_seen": 20014640, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.5777460770328102, |
| "grad_norm": 1.3161027431488037, |
| "learning_rate": 4.556234943214095e-05, |
| "loss": 0.4234, |
| "num_input_tokens_seen": 20077008, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5795292439372325, |
| "grad_norm": 1.1113629341125488, |
| "learning_rate": 4.5535761415834724e-05, |
| "loss": 0.4714, |
| "num_input_tokens_seen": 20141488, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.5813124108416547, |
| "grad_norm": 1.3117053508758545, |
| "learning_rate": 4.550910179311699e-05, |
| "loss": 0.5514, |
| "num_input_tokens_seen": 20206016, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.583095577746077, |
| "grad_norm": 1.151132345199585, |
| "learning_rate": 4.5482370656947554e-05, |
| "loss": 0.4626, |
| "num_input_tokens_seen": 20270880, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.5848787446504993, |
| "grad_norm": 2.0122318267822266, |
| "learning_rate": 4.5455568100535545e-05, |
| "loss": 0.4758, |
| "num_input_tokens_seen": 20334448, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.5866619115549215, |
| "grad_norm": 1.6800963878631592, |
| "learning_rate": 4.542869421733915e-05, |
| "loss": 0.4178, |
| "num_input_tokens_seen": 20398480, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.5884450784593438, |
| "grad_norm": 1.4573643207550049, |
| "learning_rate": 4.540174910106526e-05, |
| "loss": 0.4314, |
| "num_input_tokens_seen": 20458128, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5902282453637661, |
| "grad_norm": 1.1499691009521484, |
| "learning_rate": 4.537473284566914e-05, |
| "loss": 0.4182, |
| "num_input_tokens_seen": 20521840, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.5920114122681883, |
| "grad_norm": 1.1684014797210693, |
| "learning_rate": 4.5347645545354136e-05, |
| "loss": 0.3945, |
| "num_input_tokens_seen": 20582304, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5937945791726106, |
| "grad_norm": 1.358035683631897, |
| "learning_rate": 4.532048729457128e-05, |
| "loss": 0.4674, |
| "num_input_tokens_seen": 20642656, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.5955777460770328, |
| "grad_norm": 1.285057783126831, |
| "learning_rate": 4.5293258188019055e-05, |
| "loss": 0.4027, |
| "num_input_tokens_seen": 20709664, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5973609129814551, |
| "grad_norm": 1.0107051134109497, |
| "learning_rate": 4.526595832064296e-05, |
| "loss": 0.4402, |
| "num_input_tokens_seen": 20769888, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.5991440798858774, |
| "grad_norm": 1.144665241241455, |
| "learning_rate": 4.523858778763528e-05, |
| "loss": 0.4725, |
| "num_input_tokens_seen": 20834912, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.6009272467902995, |
| "grad_norm": 1.5452603101730347, |
| "learning_rate": 4.521114668443464e-05, |
| "loss": 0.4413, |
| "num_input_tokens_seen": 20896784, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.6027104136947218, |
| "grad_norm": 1.0601692199707031, |
| "learning_rate": 4.518363510672583e-05, |
| "loss": 0.4758, |
| "num_input_tokens_seen": 20954224, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.604493580599144, |
| "grad_norm": 1.6150104999542236, |
| "learning_rate": 4.515605315043928e-05, |
| "loss": 0.4027, |
| "num_input_tokens_seen": 21019760, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.6062767475035663, |
| "grad_norm": 1.3952018022537231, |
| "learning_rate": 4.512840091175089e-05, |
| "loss": 0.4497, |
| "num_input_tokens_seen": 21081952, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6080599144079886, |
| "grad_norm": 1.6579699516296387, |
| "learning_rate": 4.5100678487081614e-05, |
| "loss": 0.4343, |
| "num_input_tokens_seen": 21145680, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.6098430813124108, |
| "grad_norm": 1.5067193508148193, |
| "learning_rate": 4.507288597309711e-05, |
| "loss": 0.4142, |
| "num_input_tokens_seen": 21206048, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.6116262482168331, |
| "grad_norm": 1.2458901405334473, |
| "learning_rate": 4.504502346670748e-05, |
| "loss": 0.5092, |
| "num_input_tokens_seen": 21269520, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.6134094151212554, |
| "grad_norm": 1.33489990234375, |
| "learning_rate": 4.5017091065066837e-05, |
| "loss": 0.4563, |
| "num_input_tokens_seen": 21331136, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6151925820256776, |
| "grad_norm": 1.4016698598861694, |
| "learning_rate": 4.4989088865573035e-05, |
| "loss": 0.3743, |
| "num_input_tokens_seen": 21392496, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.6169757489300999, |
| "grad_norm": 1.5638152360916138, |
| "learning_rate": 4.496101696586732e-05, |
| "loss": 0.4823, |
| "num_input_tokens_seen": 21455504, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.6187589158345221, |
| "grad_norm": 1.2184085845947266, |
| "learning_rate": 4.4932875463833944e-05, |
| "loss": 0.4219, |
| "num_input_tokens_seen": 21518800, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.6205420827389444, |
| "grad_norm": 1.5745280981063843, |
| "learning_rate": 4.490466445759988e-05, |
| "loss": 0.506, |
| "num_input_tokens_seen": 21579120, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.6223252496433667, |
| "grad_norm": 1.4783879518508911, |
| "learning_rate": 4.487638404553445e-05, |
| "loss": 0.4638, |
| "num_input_tokens_seen": 21638528, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.6241084165477889, |
| "grad_norm": 1.4319891929626465, |
| "learning_rate": 4.484803432624899e-05, |
| "loss": 0.434, |
| "num_input_tokens_seen": 21703664, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6258915834522111, |
| "grad_norm": 1.3542821407318115, |
| "learning_rate": 4.48196153985965e-05, |
| "loss": 0.4472, |
| "num_input_tokens_seen": 21764336, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.6276747503566333, |
| "grad_norm": 1.1602082252502441, |
| "learning_rate": 4.4791127361671304e-05, |
| "loss": 0.3541, |
| "num_input_tokens_seen": 21825392, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6294579172610556, |
| "grad_norm": 1.6145776510238647, |
| "learning_rate": 4.476257031480871e-05, |
| "loss": 0.4401, |
| "num_input_tokens_seen": 21886848, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.6312410841654779, |
| "grad_norm": 1.1257821321487427, |
| "learning_rate": 4.4733944357584644e-05, |
| "loss": 0.5242, |
| "num_input_tokens_seen": 21951680, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.6330242510699001, |
| "grad_norm": 1.4322980642318726, |
| "learning_rate": 4.470524958981534e-05, |
| "loss": 0.4926, |
| "num_input_tokens_seen": 22016624, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.6348074179743224, |
| "grad_norm": 1.255799651145935, |
| "learning_rate": 4.4676486111556936e-05, |
| "loss": 0.4128, |
| "num_input_tokens_seen": 22079040, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6365905848787446, |
| "grad_norm": 1.157120943069458, |
| "learning_rate": 4.46476540231052e-05, |
| "loss": 0.3521, |
| "num_input_tokens_seen": 22142400, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.6383737517831669, |
| "grad_norm": 1.5262624025344849, |
| "learning_rate": 4.461875342499509e-05, |
| "loss": 0.4028, |
| "num_input_tokens_seen": 22199136, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.6401569186875892, |
| "grad_norm": 1.7937567234039307, |
| "learning_rate": 4.458978441800048e-05, |
| "loss": 0.4126, |
| "num_input_tokens_seen": 22260608, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.6419400855920114, |
| "grad_norm": 1.3475735187530518, |
| "learning_rate": 4.456074710313378e-05, |
| "loss": 0.4692, |
| "num_input_tokens_seen": 22322272, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6437232524964337, |
| "grad_norm": 1.2804908752441406, |
| "learning_rate": 4.4531641581645576e-05, |
| "loss": 0.4931, |
| "num_input_tokens_seen": 22384368, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.645506419400856, |
| "grad_norm": 1.2529658079147339, |
| "learning_rate": 4.4502467955024294e-05, |
| "loss": 0.386, |
| "num_input_tokens_seen": 22447888, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.6472895863052782, |
| "grad_norm": 1.3398923873901367, |
| "learning_rate": 4.447322632499581e-05, |
| "loss": 0.4522, |
| "num_input_tokens_seen": 22514704, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.6490727532097005, |
| "grad_norm": 1.320273518562317, |
| "learning_rate": 4.444391679352315e-05, |
| "loss": 0.4082, |
| "num_input_tokens_seen": 22573024, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.6508559201141226, |
| "grad_norm": 1.2203108072280884, |
| "learning_rate": 4.441453946280612e-05, |
| "loss": 0.4551, |
| "num_input_tokens_seen": 22632080, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.6526390870185449, |
| "grad_norm": 1.1191906929016113, |
| "learning_rate": 4.4385094435280873e-05, |
| "loss": 0.3873, |
| "num_input_tokens_seen": 22692192, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.6544222539229672, |
| "grad_norm": 1.249611496925354, |
| "learning_rate": 4.435558181361969e-05, |
| "loss": 0.398, |
| "num_input_tokens_seen": 22754624, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.6562054208273894, |
| "grad_norm": 1.4326295852661133, |
| "learning_rate": 4.432600170073048e-05, |
| "loss": 0.4159, |
| "num_input_tokens_seen": 22819616, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.6579885877318117, |
| "grad_norm": 1.2453666925430298, |
| "learning_rate": 4.429635419975655e-05, |
| "loss": 0.4343, |
| "num_input_tokens_seen": 22879136, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.6597717546362339, |
| "grad_norm": 1.1724647283554077, |
| "learning_rate": 4.426663941407614e-05, |
| "loss": 0.4287, |
| "num_input_tokens_seen": 22940528, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.6615549215406562, |
| "grad_norm": 1.185964822769165, |
| "learning_rate": 4.423685744730213e-05, |
| "loss": 0.3901, |
| "num_input_tokens_seen": 23004128, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.6633380884450785, |
| "grad_norm": 1.167861819267273, |
| "learning_rate": 4.420700840328162e-05, |
| "loss": 0.512, |
| "num_input_tokens_seen": 23066240, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.6651212553495007, |
| "grad_norm": 1.6327167749404907, |
| "learning_rate": 4.417709238609566e-05, |
| "loss": 0.4102, |
| "num_input_tokens_seen": 23126128, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.666904422253923, |
| "grad_norm": 1.0951687097549438, |
| "learning_rate": 4.4147109500058776e-05, |
| "loss": 0.4767, |
| "num_input_tokens_seen": 23182704, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.6686875891583453, |
| "grad_norm": 1.1051822900772095, |
| "learning_rate": 4.411705984971868e-05, |
| "loss": 0.4009, |
| "num_input_tokens_seen": 23244816, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.6704707560627675, |
| "grad_norm": 1.4562581777572632, |
| "learning_rate": 4.408694353985589e-05, |
| "loss": 0.5083, |
| "num_input_tokens_seen": 23307776, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.6722539229671898, |
| "grad_norm": 1.4651310443878174, |
| "learning_rate": 4.4056760675483356e-05, |
| "loss": 0.5302, |
| "num_input_tokens_seen": 23370368, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.6740370898716119, |
| "grad_norm": 1.1008446216583252, |
| "learning_rate": 4.402651136184609e-05, |
| "loss": 0.3035, |
| "num_input_tokens_seen": 23436192, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.6758202567760342, |
| "grad_norm": 1.7820332050323486, |
| "learning_rate": 4.3996195704420826e-05, |
| "loss": 0.3972, |
| "num_input_tokens_seen": 23501408, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.6776034236804565, |
| "grad_norm": 1.2907474040985107, |
| "learning_rate": 4.396581380891562e-05, |
| "loss": 0.4644, |
| "num_input_tokens_seen": 23561072, |
| "step": 1900 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 8412, |
| "num_input_tokens_seen": 23561072, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8654332564260454e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|