{
  "best_global_step": 2800,
  "best_metric": 0.94057297706604,
  "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-2800",
  "epoch": 2.923237597911227,
  "eval_steps": 200,
  "global_step": 2800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010443864229765013,
      "grad_norm": 10.195783615112305,
      "learning_rate": 9.000000000000001e-07,
      "loss": 3.4273,
      "step": 10
    },
    {
      "epoch": 0.020887728459530026,
      "grad_norm": 7.810600280761719,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 3.2543,
      "step": 20
    },
    {
      "epoch": 0.031331592689295036,
      "grad_norm": 5.423489570617676,
      "learning_rate": 2.9e-06,
      "loss": 3.0848,
      "step": 30
    },
    {
      "epoch": 0.04177545691906005,
      "grad_norm": 6.003882884979248,
      "learning_rate": 3.900000000000001e-06,
      "loss": 3.0378,
      "step": 40
    },
    {
      "epoch": 0.05221932114882506,
      "grad_norm": 5.395635604858398,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 2.935,
      "step": 50
    },
    {
      "epoch": 0.06266318537859007,
      "grad_norm": 5.4613823890686035,
      "learning_rate": 5.9e-06,
      "loss": 2.8095,
      "step": 60
    },
    {
      "epoch": 0.0731070496083551,
      "grad_norm": 5.638515472412109,
      "learning_rate": 6.9e-06,
      "loss": 2.8053,
      "step": 70
    },
    {
      "epoch": 0.0835509138381201,
      "grad_norm": 5.723353385925293,
      "learning_rate": 7.9e-06,
      "loss": 2.7206,
      "step": 80
    },
    {
      "epoch": 0.09399477806788512,
      "grad_norm": 6.676548480987549,
      "learning_rate": 8.900000000000001e-06,
      "loss": 2.6614,
      "step": 90
    },
    {
      "epoch": 0.10443864229765012,
      "grad_norm": 6.09738302230835,
      "learning_rate": 9.9e-06,
      "loss": 2.4909,
      "step": 100
    },
    {
      "epoch": 0.11488250652741515,
      "grad_norm": 6.762812614440918,
      "learning_rate": 9.967520750631542e-06,
      "loss": 2.4897,
      "step": 110
    },
    {
      "epoch": 0.12532637075718014,
      "grad_norm": 6.7795796394348145,
      "learning_rate": 9.931432695777698e-06,
      "loss": 2.3994,
      "step": 120
    },
    {
      "epoch": 0.13577023498694518,
      "grad_norm": 6.7266669273376465,
      "learning_rate": 9.895344640923855e-06,
      "loss": 2.395,
      "step": 130
    },
    {
      "epoch": 0.1462140992167102,
      "grad_norm": 6.685121536254883,
      "learning_rate": 9.859256586070011e-06,
      "loss": 2.3435,
      "step": 140
    },
    {
      "epoch": 0.1566579634464752,
      "grad_norm": 7.3826985359191895,
      "learning_rate": 9.823168531216168e-06,
      "loss": 2.1847,
      "step": 150
    },
    {
      "epoch": 0.1671018276762402,
      "grad_norm": 6.167830467224121,
      "learning_rate": 9.787080476362326e-06,
      "loss": 2.2339,
      "step": 160
    },
    {
      "epoch": 0.17754569190600522,
      "grad_norm": 9.586319923400879,
      "learning_rate": 9.750992421508482e-06,
      "loss": 2.1565,
      "step": 170
    },
    {
      "epoch": 0.18798955613577023,
      "grad_norm": 9.096244812011719,
      "learning_rate": 9.714904366654639e-06,
      "loss": 2.2398,
      "step": 180
    },
    {
      "epoch": 0.19843342036553524,
      "grad_norm": 9.149866104125977,
      "learning_rate": 9.68242511728618e-06,
      "loss": 2.0402,
      "step": 190
    },
    {
      "epoch": 0.20887728459530025,
      "grad_norm": 7.662464141845703,
      "learning_rate": 9.646337062432336e-06,
      "loss": 1.853,
      "step": 200
    },
    {
      "epoch": 0.20887728459530025,
      "eval_loss": 1.9208216667175293,
      "eval_runtime": 23.2773,
      "eval_samples_per_second": 36.559,
      "eval_steps_per_second": 4.597,
      "step": 200
    },
    {
      "epoch": 0.2193211488250653,
      "grad_norm": 7.436556339263916,
      "learning_rate": 9.610249007578492e-06,
      "loss": 1.9207,
      "step": 210
    },
    {
      "epoch": 0.2297650130548303,
      "grad_norm": 7.291353225708008,
      "learning_rate": 9.574160952724649e-06,
      "loss": 1.9513,
      "step": 220
    },
    {
      "epoch": 0.2402088772845953,
      "grad_norm": 7.357730865478516,
      "learning_rate": 9.538072897870805e-06,
      "loss": 1.8785,
      "step": 230
    },
    {
      "epoch": 0.2506527415143603,
      "grad_norm": 7.417892932891846,
      "learning_rate": 9.501984843016962e-06,
      "loss": 1.8304,
      "step": 240
    },
    {
      "epoch": 0.26109660574412535,
      "grad_norm": 7.092027187347412,
      "learning_rate": 9.46589678816312e-06,
      "loss": 1.7244,
      "step": 250
    },
    {
      "epoch": 0.27154046997389036,
      "grad_norm": 8.140602111816406,
      "learning_rate": 9.429808733309276e-06,
      "loss": 1.9412,
      "step": 260
    },
    {
      "epoch": 0.2819843342036554,
      "grad_norm": 6.528462886810303,
      "learning_rate": 9.393720678455433e-06,
      "loss": 1.7654,
      "step": 270
    },
    {
      "epoch": 0.2924281984334204,
      "grad_norm": 6.523935794830322,
      "learning_rate": 9.35763262360159e-06,
      "loss": 1.8401,
      "step": 280
    },
    {
      "epoch": 0.3028720626631854,
      "grad_norm": 6.85488224029541,
      "learning_rate": 9.321544568747746e-06,
      "loss": 1.8402,
      "step": 290
    },
    {
      "epoch": 0.3133159268929504,
      "grad_norm": 6.1414875984191895,
      "learning_rate": 9.285456513893902e-06,
      "loss": 1.682,
      "step": 300
    },
    {
      "epoch": 0.3237597911227154,
      "grad_norm": 7.549343109130859,
      "learning_rate": 9.249368459040059e-06,
      "loss": 1.5124,
      "step": 310
    },
    {
      "epoch": 0.3342036553524804,
      "grad_norm": 7.781056880950928,
      "learning_rate": 9.213280404186215e-06,
      "loss": 1.6474,
      "step": 320
    },
    {
      "epoch": 0.34464751958224543,
      "grad_norm": 6.674058437347412,
      "learning_rate": 9.177192349332372e-06,
      "loss": 1.6971,
      "step": 330
    },
    {
      "epoch": 0.35509138381201044,
      "grad_norm": 8.93964958190918,
      "learning_rate": 9.14110429447853e-06,
      "loss": 1.5545,
      "step": 340
    },
    {
      "epoch": 0.36553524804177545,
      "grad_norm": 7.935058116912842,
      "learning_rate": 9.105016239624686e-06,
      "loss": 1.6633,
      "step": 350
    },
    {
      "epoch": 0.37597911227154046,
      "grad_norm": 6.446653842926025,
      "learning_rate": 9.068928184770843e-06,
      "loss": 1.6209,
      "step": 360
    },
    {
      "epoch": 0.38642297650130547,
      "grad_norm": 9.611429214477539,
      "learning_rate": 9.036448935402382e-06,
      "loss": 1.4615,
      "step": 370
    },
    {
      "epoch": 0.3968668407310705,
      "grad_norm": 8.988059043884277,
      "learning_rate": 9.000360880548538e-06,
      "loss": 1.5815,
      "step": 380
    },
    {
      "epoch": 0.4073107049608355,
      "grad_norm": 7.07716178894043,
      "learning_rate": 8.964272825694695e-06,
      "loss": 1.6719,
      "step": 390
    },
    {
      "epoch": 0.4177545691906005,
      "grad_norm": 6.789717674255371,
      "learning_rate": 8.928184770840851e-06,
      "loss": 1.4354,
      "step": 400
    },
    {
      "epoch": 0.4177545691906005,
      "eval_loss": 1.4963077306747437,
      "eval_runtime": 23.1593,
      "eval_samples_per_second": 36.746,
      "eval_steps_per_second": 4.62,
      "step": 400
    },
    {
      "epoch": 0.4281984334203655,
      "grad_norm": 6.676051139831543,
      "learning_rate": 8.89209671598701e-06,
      "loss": 1.695,
      "step": 410
    },
    {
      "epoch": 0.4386422976501306,
      "grad_norm": 6.711126327514648,
      "learning_rate": 8.856008661133166e-06,
      "loss": 1.6562,
      "step": 420
    },
    {
      "epoch": 0.4490861618798956,
      "grad_norm": 7.0567731857299805,
      "learning_rate": 8.819920606279322e-06,
      "loss": 1.6646,
      "step": 430
    },
    {
      "epoch": 0.4595300261096606,
      "grad_norm": 6.944768905639648,
      "learning_rate": 8.783832551425479e-06,
      "loss": 1.4864,
      "step": 440
    },
    {
      "epoch": 0.4699738903394256,
      "grad_norm": 6.025828838348389,
      "learning_rate": 8.747744496571635e-06,
      "loss": 1.4261,
      "step": 450
    },
    {
      "epoch": 0.4804177545691906,
      "grad_norm": 7.318626880645752,
      "learning_rate": 8.711656441717792e-06,
      "loss": 1.4183,
      "step": 460
    },
    {
      "epoch": 0.4908616187989556,
      "grad_norm": 9.207475662231445,
      "learning_rate": 8.675568386863948e-06,
      "loss": 1.6308,
      "step": 470
    },
    {
      "epoch": 0.5013054830287206,
      "grad_norm": 7.152507781982422,
      "learning_rate": 8.639480332010105e-06,
      "loss": 1.5118,
      "step": 480
    },
    {
      "epoch": 0.5117493472584856,
      "grad_norm": 6.89570426940918,
      "learning_rate": 8.603392277156261e-06,
      "loss": 1.6088,
      "step": 490
    },
    {
      "epoch": 0.5221932114882507,
      "grad_norm": 8.016292572021484,
      "learning_rate": 8.567304222302419e-06,
      "loss": 1.6461,
      "step": 500
    },
    {
      "epoch": 0.5326370757180157,
      "grad_norm": 5.700518608093262,
      "learning_rate": 8.531216167448576e-06,
      "loss": 1.5497,
      "step": 510
    },
    {
      "epoch": 0.5430809399477807,
      "grad_norm": 7.127388000488281,
      "learning_rate": 8.495128112594732e-06,
      "loss": 1.4443,
      "step": 520
    },
    {
      "epoch": 0.5535248041775457,
      "grad_norm": 9.201033592224121,
      "learning_rate": 8.459040057740888e-06,
      "loss": 1.3384,
      "step": 530
    },
    {
      "epoch": 0.5639686684073107,
      "grad_norm": 9.565522193908691,
      "learning_rate": 8.422952002887045e-06,
      "loss": 1.3732,
      "step": 540
    },
    {
      "epoch": 0.5744125326370757,
      "grad_norm": 6.29095458984375,
      "learning_rate": 8.386863948033201e-06,
      "loss": 1.5768,
      "step": 550
    },
    {
      "epoch": 0.5848563968668408,
      "grad_norm": 7.279628753662109,
      "learning_rate": 8.350775893179358e-06,
      "loss": 1.5237,
      "step": 560
    },
    {
      "epoch": 0.5953002610966057,
      "grad_norm": 5.866252899169922,
      "learning_rate": 8.314687838325514e-06,
      "loss": 1.491,
      "step": 570
    },
    {
      "epoch": 0.6057441253263708,
      "grad_norm": 6.8628621101379395,
      "learning_rate": 8.27859978347167e-06,
      "loss": 1.3991,
      "step": 580
    },
    {
      "epoch": 0.6161879895561357,
      "grad_norm": 5.9102654457092285,
      "learning_rate": 8.242511728617829e-06,
      "loss": 1.3842,
      "step": 590
    },
    {
      "epoch": 0.6266318537859008,
      "grad_norm": 6.6509199142456055,
      "learning_rate": 8.206423673763985e-06,
      "loss": 1.4015,
      "step": 600
    },
    {
      "epoch": 0.6266318537859008,
      "eval_loss": 1.3116086721420288,
      "eval_runtime": 23.131,
      "eval_samples_per_second": 36.79,
      "eval_steps_per_second": 4.626,
      "step": 600
    },
    {
      "epoch": 0.6370757180156658,
      "grad_norm": 10.004154205322266,
      "learning_rate": 8.170335618910142e-06,
      "loss": 1.4046,
      "step": 610
    },
    {
      "epoch": 0.6475195822454308,
      "grad_norm": 6.851074695587158,
      "learning_rate": 8.134247564056298e-06,
      "loss": 1.3358,
      "step": 620
    },
    {
      "epoch": 0.6579634464751958,
      "grad_norm": 6.676290512084961,
      "learning_rate": 8.098159509202455e-06,
      "loss": 1.2649,
      "step": 630
    },
    {
      "epoch": 0.6684073107049608,
      "grad_norm": 6.854121685028076,
      "learning_rate": 8.062071454348611e-06,
      "loss": 1.4563,
      "step": 640
    },
    {
      "epoch": 0.6788511749347258,
      "grad_norm": 7.104318141937256,
      "learning_rate": 8.025983399494768e-06,
      "loss": 1.2667,
      "step": 650
    },
    {
      "epoch": 0.6892950391644909,
      "grad_norm": 6.3163957595825195,
      "learning_rate": 7.989895344640924e-06,
      "loss": 1.4532,
      "step": 660
    },
    {
      "epoch": 0.6997389033942559,
      "grad_norm": 7.5210652351379395,
      "learning_rate": 7.95380728978708e-06,
      "loss": 1.179,
      "step": 670
    },
    {
      "epoch": 0.7101827676240209,
      "grad_norm": 7.099525451660156,
      "learning_rate": 7.917719234933237e-06,
      "loss": 1.2406,
      "step": 680
    },
    {
      "epoch": 0.720626631853786,
      "grad_norm": 6.814334392547607,
      "learning_rate": 7.881631180079395e-06,
      "loss": 1.1178,
      "step": 690
    },
    {
      "epoch": 0.7310704960835509,
      "grad_norm": 6.895037651062012,
      "learning_rate": 7.845543125225551e-06,
      "loss": 1.3291,
      "step": 700
    },
    {
      "epoch": 0.741514360313316,
      "grad_norm": 6.383569717407227,
      "learning_rate": 7.809455070371708e-06,
      "loss": 1.2887,
      "step": 710
    },
    {
      "epoch": 0.7519582245430809,
      "grad_norm": 8.13758373260498,
      "learning_rate": 7.773367015517864e-06,
      "loss": 1.2576,
      "step": 720
    },
    {
      "epoch": 0.762402088772846,
      "grad_norm": 5.2739362716674805,
      "learning_rate": 7.73727896066402e-06,
      "loss": 1.2835,
      "step": 730
    },
    {
      "epoch": 0.7728459530026109,
      "grad_norm": 7.7005934715271,
      "learning_rate": 7.701190905810177e-06,
      "loss": 1.2633,
      "step": 740
    },
    {
      "epoch": 0.783289817232376,
      "grad_norm": 6.738245964050293,
      "learning_rate": 7.665102850956334e-06,
      "loss": 1.2129,
      "step": 750
    },
    {
      "epoch": 0.793733681462141,
      "grad_norm": 5.725705146789551,
      "learning_rate": 7.629014796102491e-06,
      "loss": 1.2501,
      "step": 760
    },
    {
      "epoch": 0.804177545691906,
      "grad_norm": 6.256880760192871,
      "learning_rate": 7.5929267412486475e-06,
      "loss": 1.2567,
      "step": 770
    },
    {
      "epoch": 0.814621409921671,
      "grad_norm": 6.835810661315918,
      "learning_rate": 7.556838686394804e-06,
      "loss": 1.2408,
      "step": 780
    },
    {
      "epoch": 0.825065274151436,
      "grad_norm": 5.811525821685791,
      "learning_rate": 7.52075063154096e-06,
      "loss": 1.2409,
      "step": 790
    },
    {
      "epoch": 0.835509138381201,
      "grad_norm": 8.554996490478516,
      "learning_rate": 7.484662576687118e-06,
      "loss": 1.2575,
      "step": 800
    },
    {
      "epoch": 0.835509138381201,
      "eval_loss": 1.2006852626800537,
      "eval_runtime": 23.1272,
      "eval_samples_per_second": 36.796,
      "eval_steps_per_second": 4.627,
      "step": 800
    },
    {
      "epoch": 0.8459530026109661,
      "grad_norm": 7.091418266296387,
      "learning_rate": 7.448574521833274e-06,
      "loss": 1.0256,
      "step": 810
    },
    {
      "epoch": 0.856396866840731,
      "grad_norm": 7.5542683601379395,
      "learning_rate": 7.4124864669794306e-06,
      "loss": 1.0605,
      "step": 820
    },
    {
      "epoch": 0.8668407310704961,
      "grad_norm": 7.177179336547852,
      "learning_rate": 7.376398412125587e-06,
      "loss": 1.1747,
      "step": 830
    },
    {
      "epoch": 0.8772845953002611,
      "grad_norm": 6.226783752441406,
      "learning_rate": 7.3403103572717434e-06,
      "loss": 1.1518,
      "step": 840
    },
    {
      "epoch": 0.8877284595300261,
      "grad_norm": 7.062104225158691,
      "learning_rate": 7.304222302417901e-06,
      "loss": 1.2322,
      "step": 850
    },
    {
      "epoch": 0.8981723237597912,
      "grad_norm": 8.756392478942871,
      "learning_rate": 7.268134247564057e-06,
      "loss": 1.271,
      "step": 860
    },
    {
      "epoch": 0.9086161879895561,
      "grad_norm": 6.364953994750977,
      "learning_rate": 7.232046192710214e-06,
      "loss": 1.1853,
      "step": 870
    },
    {
      "epoch": 0.9190600522193212,
      "grad_norm": 6.629589080810547,
      "learning_rate": 7.19595813785637e-06,
      "loss": 1.2279,
      "step": 880
    },
    {
      "epoch": 0.9295039164490861,
      "grad_norm": 5.4719977378845215,
      "learning_rate": 7.1598700830025265e-06,
      "loss": 1.0789,
      "step": 890
    },
    {
      "epoch": 0.9399477806788512,
      "grad_norm": 5.824141025543213,
      "learning_rate": 7.123782028148684e-06,
      "loss": 1.0435,
      "step": 900
    },
    {
      "epoch": 0.9503916449086162,
      "grad_norm": 9.214339256286621,
      "learning_rate": 7.08769397329484e-06,
      "loss": 1.0518,
      "step": 910
    },
    {
      "epoch": 0.9608355091383812,
      "grad_norm": 6.3007941246032715,
      "learning_rate": 7.051605918440997e-06,
      "loss": 1.0297,
      "step": 920
    },
    {
      "epoch": 0.9712793733681462,
      "grad_norm": 5.431386947631836,
      "learning_rate": 7.015517863587153e-06,
      "loss": 1.1292,
      "step": 930
    },
    {
      "epoch": 0.9817232375979112,
      "grad_norm": 8.941716194152832,
      "learning_rate": 6.9794298087333105e-06,
      "loss": 1.1617,
      "step": 940
    },
    {
      "epoch": 0.9921671018276762,
      "grad_norm": 8.926597595214844,
      "learning_rate": 6.943341753879467e-06,
      "loss": 1.3497,
      "step": 950
    },
    {
      "epoch": 1.002088772845953,
      "grad_norm": 6.556091785430908,
      "learning_rate": 6.907253699025623e-06,
      "loss": 1.176,
      "step": 960
    },
    {
      "epoch": 1.012532637075718,
      "grad_norm": 12.022808074951172,
      "learning_rate": 6.87116564417178e-06,
      "loss": 0.9251,
      "step": 970
    },
    {
      "epoch": 1.022976501305483,
      "grad_norm": 7.0122389793396,
      "learning_rate": 6.835077589317936e-06,
      "loss": 0.9231,
      "step": 980
    },
    {
      "epoch": 1.033420365535248,
      "grad_norm": 6.355090141296387,
      "learning_rate": 6.7989895344640936e-06,
      "loss": 1.0371,
      "step": 990
    },
    {
      "epoch": 1.0438642297650131,
      "grad_norm": 7.36827278137207,
      "learning_rate": 6.76290147961025e-06,
      "loss": 1.0666,
      "step": 1000
    },
    {
      "epoch": 1.0438642297650131,
      "eval_loss": 1.1358416080474854,
      "eval_runtime": 23.1354,
      "eval_samples_per_second": 36.783,
      "eval_steps_per_second": 4.625,
      "step": 1000
    },
    {
      "epoch": 1.054308093994778,
      "grad_norm": 5.1998772621154785,
      "learning_rate": 6.7268134247564065e-06,
      "loss": 0.847,
      "step": 1010
    },
    {
      "epoch": 1.064751958224543,
      "grad_norm": 6.828693866729736,
      "learning_rate": 6.690725369902563e-06,
      "loss": 1.0755,
      "step": 1020
    },
    {
      "epoch": 1.0751958224543081,
      "grad_norm": 6.408750534057617,
      "learning_rate": 6.65463731504872e-06,
      "loss": 0.9039,
      "step": 1030
    },
    {
      "epoch": 1.0856396866840732,
      "grad_norm": 7.197463035583496,
      "learning_rate": 6.618549260194877e-06,
      "loss": 0.959,
      "step": 1040
    },
    {
      "epoch": 1.096083550913838,
      "grad_norm": 5.504906177520752,
      "learning_rate": 6.582461205341033e-06,
      "loss": 0.9467,
      "step": 1050
    },
    {
      "epoch": 1.106527415143603,
      "grad_norm": 7.88711404800415,
      "learning_rate": 6.5463731504871896e-06,
      "loss": 1.0635,
      "step": 1060
    },
    {
      "epoch": 1.1169712793733682,
      "grad_norm": 5.1561055183410645,
      "learning_rate": 6.510285095633346e-06,
      "loss": 0.9635,
      "step": 1070
    },
    {
      "epoch": 1.1274151436031332,
      "grad_norm": 5.088303565979004,
      "learning_rate": 6.474197040779503e-06,
      "loss": 0.9605,
      "step": 1080
    },
    {
      "epoch": 1.137859007832898,
      "grad_norm": 4.595157146453857,
      "learning_rate": 6.43810898592566e-06,
      "loss": 1.0494,
      "step": 1090
    },
    {
      "epoch": 1.1483028720626631,
      "grad_norm": 6.682077884674072,
      "learning_rate": 6.402020931071816e-06,
      "loss": 0.888,
      "step": 1100
    },
    {
      "epoch": 1.1587467362924282,
      "grad_norm": 9.08073902130127,
      "learning_rate": 6.365932876217973e-06,
      "loss": 1.1239,
      "step": 1110
    },
    {
      "epoch": 1.1691906005221933,
      "grad_norm": 7.480021953582764,
      "learning_rate": 6.329844821364129e-06,
      "loss": 1.0337,
      "step": 1120
    },
    {
      "epoch": 1.1796344647519583,
      "grad_norm": 5.750502109527588,
      "learning_rate": 6.293756766510286e-06,
      "loss": 1.132,
      "step": 1130
    },
    {
      "epoch": 1.1900783289817232,
      "grad_norm": 6.151831150054932,
      "learning_rate": 6.257668711656443e-06,
      "loss": 0.9985,
      "step": 1140
    },
    {
      "epoch": 1.2005221932114882,
      "grad_norm": 6.567698001861572,
      "learning_rate": 6.221580656802599e-06,
      "loss": 1.1177,
      "step": 1150
    },
    {
      "epoch": 1.2109660574412533,
      "grad_norm": 6.1963887214660645,
      "learning_rate": 6.185492601948756e-06,
      "loss": 0.9328,
      "step": 1160
    },
    {
      "epoch": 1.2214099216710184,
      "grad_norm": 5.044003009796143,
      "learning_rate": 6.149404547094913e-06,
      "loss": 0.8986,
      "step": 1170
    },
    {
      "epoch": 1.2318537859007832,
      "grad_norm": 6.648619651794434,
      "learning_rate": 6.113316492241068e-06,
      "loss": 0.9905,
      "step": 1180
    },
    {
      "epoch": 1.2422976501305483,
      "grad_norm": 9.110396385192871,
      "learning_rate": 6.077228437387225e-06,
      "loss": 1.0126,
      "step": 1190
    },
    {
      "epoch": 1.2527415143603133,
      "grad_norm": 8.707374572753906,
      "learning_rate": 6.0411403825333815e-06,
      "loss": 0.8611,
      "step": 1200
    },
    {
      "epoch": 1.2527415143603133,
      "eval_loss": 1.0788438320159912,
      "eval_runtime": 23.1258,
      "eval_samples_per_second": 36.799,
      "eval_steps_per_second": 4.627,
      "step": 1200
    },
    {
      "epoch": 1.2631853785900784,
      "grad_norm": 5.457468509674072,
      "learning_rate": 6.005052327679538e-06,
      "loss": 0.8327,
      "step": 1210
    },
    {
      "epoch": 1.2736292428198435,
      "grad_norm": 6.050765037536621,
      "learning_rate": 5.9689642728256944e-06,
      "loss": 1.0782,
      "step": 1220
    },
    {
      "epoch": 1.2840731070496083,
      "grad_norm": 6.254447937011719,
      "learning_rate": 5.932876217971852e-06,
      "loss": 0.9388,
      "step": 1230
    },
    {
      "epoch": 1.2945169712793734,
      "grad_norm": 5.181304931640625,
      "learning_rate": 5.896788163118008e-06,
      "loss": 0.9711,
      "step": 1240
    },
    {
      "epoch": 1.3049608355091384,
      "grad_norm": 6.832638263702393,
      "learning_rate": 5.860700108264165e-06,
      "loss": 1.0333,
      "step": 1250
    },
    {
      "epoch": 1.3154046997389033,
      "grad_norm": 8.406023025512695,
      "learning_rate": 5.824612053410321e-06,
      "loss": 1.0902,
      "step": 1260
    },
    {
      "epoch": 1.3258485639686683,
      "grad_norm": 6.346268653869629,
      "learning_rate": 5.7885239985564775e-06,
      "loss": 0.8651,
      "step": 1270
    },
    {
      "epoch": 1.3362924281984334,
      "grad_norm": 8.447615623474121,
      "learning_rate": 5.752435943702635e-06,
      "loss": 1.046,
      "step": 1280
    },
    {
      "epoch": 1.3467362924281985,
      "grad_norm": 8.351264953613281,
      "learning_rate": 5.716347888848791e-06,
      "loss": 1.029,
      "step": 1290
    },
    {
      "epoch": 1.3571801566579635,
      "grad_norm": 6.036417007446289,
      "learning_rate": 5.680259833994948e-06,
      "loss": 1.0089,
      "step": 1300
    },
    {
      "epoch": 1.3676240208877284,
      "grad_norm": 5.646811485290527,
      "learning_rate": 5.644171779141104e-06,
      "loss": 0.9346,
      "step": 1310
    },
    {
      "epoch": 1.3780678851174935,
      "grad_norm": 6.4004950523376465,
      "learning_rate": 5.608083724287261e-06,
      "loss": 0.9489,
      "step": 1320
    },
    {
      "epoch": 1.3885117493472585,
      "grad_norm": 5.746732234954834,
      "learning_rate": 5.571995669433418e-06,
      "loss": 0.9356,
      "step": 1330
    },
    {
      "epoch": 1.3989556135770234,
      "grad_norm": 5.3405632972717285,
      "learning_rate": 5.535907614579574e-06,
      "loss": 0.7099,
      "step": 1340
    },
    {
      "epoch": 1.4093994778067884,
      "grad_norm": 6.424510955810547,
      "learning_rate": 5.499819559725731e-06,
      "loss": 0.6955,
      "step": 1350
    },
    {
      "epoch": 1.4198433420365535,
      "grad_norm": 5.751818656921387,
      "learning_rate": 5.463731504871887e-06,
      "loss": 0.9094,
      "step": 1360
    },
    {
      "epoch": 1.4302872062663186,
      "grad_norm": 7.787084579467773,
      "learning_rate": 5.4276434500180445e-06,
      "loss": 0.9618,
      "step": 1370
    },
    {
      "epoch": 1.4407310704960836,
      "grad_norm": 6.467785835266113,
      "learning_rate": 5.391555395164201e-06,
      "loss": 0.8549,
      "step": 1380
    },
    {
      "epoch": 1.4511749347258487,
      "grad_norm": 6.860886573791504,
      "learning_rate": 5.3554673403103574e-06,
      "loss": 0.8342,
      "step": 1390
    },
    {
      "epoch": 1.4616187989556135,
      "grad_norm": 5.627669811248779,
      "learning_rate": 5.319379285456514e-06,
      "loss": 0.9012,
      "step": 1400
    },
    {
      "epoch": 1.4616187989556135,
      "eval_loss": 1.0375572443008423,
      "eval_runtime": 23.1318,
      "eval_samples_per_second": 36.789,
      "eval_steps_per_second": 4.626,
      "step": 1400
    },
    {
      "epoch": 1.4720626631853786,
      "grad_norm": 7.144196033477783,
      "learning_rate": 5.28329123060267e-06,
      "loss": 0.9949,
      "step": 1410
    },
    {
      "epoch": 1.4825065274151437,
      "grad_norm": 6.208961486816406,
      "learning_rate": 5.247203175748828e-06,
      "loss": 1.0719,
      "step": 1420
    },
    {
      "epoch": 1.4929503916449085,
      "grad_norm": 7.110988140106201,
      "learning_rate": 5.211115120894984e-06,
      "loss": 0.9162,
      "step": 1430
    },
    {
      "epoch": 1.5033942558746736,
      "grad_norm": 6.903599739074707,
      "learning_rate": 5.1750270660411405e-06,
      "loss": 0.9974,
      "step": 1440
    },
    {
      "epoch": 1.5138381201044386,
      "grad_norm": 5.059232234954834,
      "learning_rate": 5.138939011187297e-06,
      "loss": 0.9602,
      "step": 1450
    },
    {
      "epoch": 1.5242819843342037,
      "grad_norm": 3.6045143604278564,
      "learning_rate": 5.1028509563334534e-06,
      "loss": 0.8448,
      "step": 1460
    },
    {
      "epoch": 1.5347258485639688,
      "grad_norm": 8.628995895385742,
      "learning_rate": 5.066762901479611e-06,
      "loss": 0.914,
      "step": 1470
    },
    {
      "epoch": 1.5451697127937338,
      "grad_norm": 7.227733612060547,
      "learning_rate": 5.030674846625767e-06,
      "loss": 0.9206,
      "step": 1480
    },
    {
      "epoch": 1.5556135770234987,
      "grad_norm": 7.930648326873779,
      "learning_rate": 4.994586791771924e-06,
      "loss": 1.1221,
      "step": 1490
    },
    {
      "epoch": 1.5660574412532637,
      "grad_norm": 6.340338706970215,
      "learning_rate": 4.95849873691808e-06,
      "loss": 0.8298,
      "step": 1500
    },
    {
      "epoch": 1.5765013054830286,
      "grad_norm": 5.558096408843994,
      "learning_rate": 4.922410682064237e-06,
      "loss": 0.9457,
      "step": 1510
    },
    {
      "epoch": 1.5869451697127936,
      "grad_norm": 7.608903408050537,
      "learning_rate": 4.886322627210394e-06,
      "loss": 0.9164,
      "step": 1520
    },
    {
      "epoch": 1.5973890339425587,
      "grad_norm": 9.46885871887207,
      "learning_rate": 4.85023457235655e-06,
      "loss": 0.9498,
      "step": 1530
    },
    {
      "epoch": 1.6078328981723238,
      "grad_norm": 7.6691107749938965,
      "learning_rate": 4.814146517502707e-06,
      "loss": 0.9764,
      "step": 1540
    },
    {
      "epoch": 1.6182767624020888,
      "grad_norm": 8.231538772583008,
      "learning_rate": 4.778058462648863e-06,
      "loss": 0.8314,
      "step": 1550
    },
    {
      "epoch": 1.628720626631854,
      "grad_norm": 5.868668556213379,
      "learning_rate": 4.7419704077950205e-06,
      "loss": 0.9127,
      "step": 1560
    },
    {
      "epoch": 1.6391644908616188,
      "grad_norm": 7.236291885375977,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.9485,
      "step": 1570
    },
    {
      "epoch": 1.6496083550913838,
      "grad_norm": 6.871162414550781,
      "learning_rate": 4.669794298087333e-06,
      "loss": 1.0603,
      "step": 1580
    },
    {
      "epoch": 1.6600522193211487,
      "grad_norm": 6.303982734680176,
      "learning_rate": 4.63370624323349e-06,
      "loss": 0.945,
      "step": 1590
    },
    {
      "epoch": 1.6704960835509137,
      "grad_norm": 7.167915344238281,
      "learning_rate": 4.597618188379647e-06,
      "loss": 0.7351,
      "step": 1600
    },
    {
      "epoch": 1.6704960835509137,
      "eval_loss": 1.0095878839492798,
      "eval_runtime": 23.1917,
      "eval_samples_per_second": 36.694,
      "eval_steps_per_second": 4.614,
      "step": 1600
    },
    {
      "epoch": 1.6809399477806788,
      "grad_norm": 6.626043796539307,
      "learning_rate": 4.5615301335258035e-06,
      "loss": 0.9369,
      "step": 1610
    },
    {
      "epoch": 1.6913838120104439,
      "grad_norm": 6.645303726196289,
      "learning_rate": 4.52544207867196e-06,
      "loss": 0.8509,
      "step": 1620
    },
    {
      "epoch": 1.701827676240209,
      "grad_norm": 7.132906913757324,
      "learning_rate": 4.4893540238181164e-06,
      "loss": 0.9522,
      "step": 1630
    },
    {
      "epoch": 1.712271540469974,
      "grad_norm": 6.155941009521484,
      "learning_rate": 4.453265968964273e-06,
      "loss": 0.9662,
      "step": 1640
    },
    {
      "epoch": 1.722715404699739,
      "grad_norm": 5.0147705078125,
      "learning_rate": 4.41717791411043e-06,
      "loss": 0.8755,
      "step": 1650
    },
    {
      "epoch": 1.733159268929504,
      "grad_norm": 7.039682388305664,
      "learning_rate": 4.381089859256587e-06,
      "loss": 0.8761,
      "step": 1660
    },
    {
      "epoch": 1.743603133159269,
      "grad_norm": 8.536678314208984,
      "learning_rate": 4.345001804402743e-06,
      "loss": 0.8054,
      "step": 1670
    },
    {
      "epoch": 1.7540469973890338,
      "grad_norm": 6.860482215881348,
      "learning_rate": 4.3089137495488995e-06,
      "loss": 0.9095,
      "step": 1680
    },
    {
      "epoch": 1.7644908616187989,
      "grad_norm": 6.3796563148498535,
      "learning_rate": 4.272825694695056e-06,
      "loss": 1.08,
      "step": 1690
    },
    {
      "epoch": 1.774934725848564,
      "grad_norm": 11.013219833374023,
      "learning_rate": 4.236737639841213e-06,
      "loss": 0.834,
      "step": 1700
    },
    {
      "epoch": 1.785378590078329,
      "grad_norm": 6.231220722198486,
      "learning_rate": 4.20064958498737e-06,
      "loss": 0.8488,
      "step": 1710
    },
    {
      "epoch": 1.795822454308094,
      "grad_norm": 7.019144535064697,
      "learning_rate": 4.164561530133526e-06,
      "loss": 0.7922,
      "step": 1720
    },
    {
      "epoch": 1.8062663185378591,
      "grad_norm": 6.586852550506592,
      "learning_rate": 4.128473475279683e-06,
      "loss": 0.8326,
      "step": 1730
    },
    {
      "epoch": 1.816710182767624,
      "grad_norm": 6.34022855758667,
      "learning_rate": 4.09238542042584e-06,
      "loss": 0.906,
      "step": 1740
    },
    {
      "epoch": 1.827154046997389,
      "grad_norm": 7.541686058044434,
      "learning_rate": 4.056297365571996e-06,
      "loss": 0.9029,
      "step": 1750
    },
    {
      "epoch": 1.837597911227154,
      "grad_norm": 5.867885589599609,
      "learning_rate": 4.020209310718153e-06,
      "loss": 0.8072,
      "step": 1760
    },
    {
      "epoch": 1.848041775456919,
      "grad_norm": 4.805484771728516,
      "learning_rate": 3.984121255864309e-06,
      "loss": 0.9158,
      "step": 1770
    },
    {
      "epoch": 1.858485639686684,
      "grad_norm": 5.949447154998779,
      "learning_rate": 3.948033201010466e-06,
      "loss": 0.8116,
      "step": 1780
    },
    {
      "epoch": 1.868929503916449,
      "grad_norm": 7.641289234161377,
      "learning_rate": 3.911945146156623e-06,
      "loss": 0.8824,
      "step": 1790
    },
    {
      "epoch": 1.8793733681462141,
      "grad_norm": 5.530484199523926,
      "learning_rate": 3.8758570913027795e-06,
      "loss": 0.9264,
      "step": 1800
    },
    {
      "epoch": 1.8793733681462141,
      "eval_loss": 0.9786838293075562,
      "eval_runtime": 23.1205,
      "eval_samples_per_second": 36.807,
      "eval_steps_per_second": 4.628,
      "step": 1800
    },
| { | |
| "epoch": 1.8898172323759792, | |
| "grad_norm": 5.981470584869385, | |
| "learning_rate": 3.839769036448936e-06, | |
| "loss": 0.7929, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.9002610966057443, | |
| "grad_norm": 8.019466400146484, | |
| "learning_rate": 3.8036809815950928e-06, | |
| "loss": 0.7616, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.9107049608355091, | |
| "grad_norm": 7.649405479431152, | |
| "learning_rate": 3.7675929267412492e-06, | |
| "loss": 0.8661, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.9211488250652742, | |
| "grad_norm": 8.259872436523438, | |
| "learning_rate": 3.7315048718874057e-06, | |
| "loss": 1.0303, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.931592689295039, | |
| "grad_norm": 5.947850227355957, | |
| "learning_rate": 3.695416817033562e-06, | |
| "loss": 0.9326, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.942036553524804, | |
| "grad_norm": 5.193607330322266, | |
| "learning_rate": 3.6593287621797186e-06, | |
| "loss": 0.8527, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.9524804177545692, | |
| "grad_norm": 6.412321090698242, | |
| "learning_rate": 3.623240707325875e-06, | |
| "loss": 0.8768, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.9629242819843342, | |
| "grad_norm": 6.859325408935547, | |
| "learning_rate": 3.587152652472032e-06, | |
| "loss": 0.8131, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.9733681462140993, | |
| "grad_norm": 4.910820484161377, | |
| "learning_rate": 3.5510645976181883e-06, | |
| "loss": 0.7232, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.9838120104438643, | |
| "grad_norm": 6.052480220794678, | |
| "learning_rate": 3.5149765427643452e-06, | |
| "loss": 0.7567, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.9942558746736292, | |
| "grad_norm": 6.609030246734619, | |
| "learning_rate": 3.4788884879105017e-06, | |
| "loss": 0.8219, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.004177545691906, | |
| "grad_norm": 4.539740562438965, | |
| "learning_rate": 3.4428004330566585e-06, | |
| "loss": 0.8498, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.014621409921671, | |
| "grad_norm": 5.981167316436768, | |
| "learning_rate": 3.406712378202815e-06, | |
| "loss": 0.7024, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.025065274151436, | |
| "grad_norm": 6.600665092468262, | |
| "learning_rate": 3.3706243233489714e-06, | |
| "loss": 0.7467, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.035509138381201, | |
| "grad_norm": 5.560609340667725, | |
| "learning_rate": 3.3345362684951283e-06, | |
| "loss": 0.7434, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.045953002610966, | |
| "grad_norm": 4.679533004760742, | |
| "learning_rate": 3.2984482136412848e-06, | |
| "loss": 0.7125, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.056396866840731, | |
| "grad_norm": 7.177086353302002, | |
| "learning_rate": 3.2623601587874416e-06, | |
| "loss": 0.6745, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.066840731070496, | |
| "grad_norm": 6.860986709594727, | |
| "learning_rate": 3.226272103933598e-06, | |
| "loss": 0.6814, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.077284595300261, | |
| "grad_norm": 8.40719223022461, | |
| "learning_rate": 3.190184049079755e-06, | |
| "loss": 0.7077, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.0877284595300263, | |
| "grad_norm": 5.830367088317871, | |
| "learning_rate": 3.1540959942259114e-06, | |
| "loss": 0.8435, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.0877284595300263, | |
| "eval_loss": 0.9753687381744385, | |
| "eval_runtime": 23.1964, | |
| "eval_samples_per_second": 36.687, | |
| "eval_steps_per_second": 4.613, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.0981723237597913, | |
| "grad_norm": 6.964289665222168, | |
| "learning_rate": 3.118007939372068e-06, | |
| "loss": 0.6964, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.108616187989556, | |
| "grad_norm": 6.764989852905273, | |
| "learning_rate": 3.0819198845182247e-06, | |
| "loss": 0.7107, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.119060052219321, | |
| "grad_norm": 7.992194652557373, | |
| "learning_rate": 3.045831829664381e-06, | |
| "loss": 0.7146, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.129503916449086, | |
| "grad_norm": 7.102138996124268, | |
| "learning_rate": 3.009743774810538e-06, | |
| "loss": 0.686, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.139947780678851, | |
| "grad_norm": 7.446751117706299, | |
| "learning_rate": 2.9736557199566945e-06, | |
| "loss": 0.7681, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.1503916449086162, | |
| "grad_norm": 7.091776371002197, | |
| "learning_rate": 2.9375676651028514e-06, | |
| "loss": 0.7674, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.1608355091383813, | |
| "grad_norm": 7.994192123413086, | |
| "learning_rate": 2.901479610249008e-06, | |
| "loss": 0.7187, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.1712793733681464, | |
| "grad_norm": 4.8329386711120605, | |
| "learning_rate": 2.8653915553951643e-06, | |
| "loss": 0.7501, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.1817232375979114, | |
| "grad_norm": 6.802753925323486, | |
| "learning_rate": 2.829303500541321e-06, | |
| "loss": 0.5658, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.192167101827676, | |
| "grad_norm": 7.07351541519165, | |
| "learning_rate": 2.7932154456874776e-06, | |
| "loss": 0.8106, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.202610966057441, | |
| "grad_norm": 6.761138916015625, | |
| "learning_rate": 2.7571273908336344e-06, | |
| "loss": 0.7434, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.213054830287206, | |
| "grad_norm": 4.220724582672119, | |
| "learning_rate": 2.721039335979791e-06, | |
| "loss": 0.7526, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.2234986945169712, | |
| "grad_norm": 6.14243745803833, | |
| "learning_rate": 2.6849512811259478e-06, | |
| "loss": 0.6486, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.2339425587467363, | |
| "grad_norm": 8.640827178955078, | |
| "learning_rate": 2.6488632262721042e-06, | |
| "loss": 0.5949, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.2443864229765014, | |
| "grad_norm": 6.576625823974609, | |
| "learning_rate": 2.6127751714182607e-06, | |
| "loss": 0.6923, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.2548302872062664, | |
| "grad_norm": 6.136504173278809, | |
| "learning_rate": 2.5766871165644175e-06, | |
| "loss": 0.6255, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.2652741514360315, | |
| "grad_norm": 7.2910847663879395, | |
| "learning_rate": 2.540599061710574e-06, | |
| "loss": 0.6843, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.275718015665796, | |
| "grad_norm": 6.936916351318359, | |
| "learning_rate": 2.504511006856731e-06, | |
| "loss": 0.6751, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.286161879895561, | |
| "grad_norm": 5.899853229522705, | |
| "learning_rate": 2.4684229520028873e-06, | |
| "loss": 0.6584, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.2966057441253263, | |
| "grad_norm": 4.891731262207031, | |
| "learning_rate": 2.4323348971490438e-06, | |
| "loss": 0.6373, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.2966057441253263, | |
| "eval_loss": 0.958043098449707, | |
| "eval_runtime": 23.1881, | |
| "eval_samples_per_second": 36.7, | |
| "eval_steps_per_second": 4.614, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.3070496083550913, | |
| "grad_norm": 6.206886291503906, | |
| "learning_rate": 2.3962468422952e-06, | |
| "loss": 0.6725, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.3174934725848564, | |
| "grad_norm": 4.663551330566406, | |
| "learning_rate": 2.360158787441357e-06, | |
| "loss": 0.5578, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.3279373368146214, | |
| "grad_norm": 6.175649166107178, | |
| "learning_rate": 2.3240707325875135e-06, | |
| "loss": 0.6835, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.3383812010443865, | |
| "grad_norm": 6.676774501800537, | |
| "learning_rate": 2.2879826777336704e-06, | |
| "loss": 0.7966, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.3488250652741516, | |
| "grad_norm": 8.847614288330078, | |
| "learning_rate": 2.251894622879827e-06, | |
| "loss": 0.7679, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.3592689295039166, | |
| "grad_norm": 6.491757869720459, | |
| "learning_rate": 2.2158065680259837e-06, | |
| "loss": 0.6274, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.3697127937336813, | |
| "grad_norm": 6.540876388549805, | |
| "learning_rate": 2.17971851317214e-06, | |
| "loss": 0.674, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.3801566579634463, | |
| "grad_norm": 7.067712306976318, | |
| "learning_rate": 2.1436304583182966e-06, | |
| "loss": 0.6716, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.3906005221932114, | |
| "grad_norm": 4.959332466125488, | |
| "learning_rate": 2.1075424034644535e-06, | |
| "loss": 0.6729, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.4010443864229765, | |
| "grad_norm": 4.016025066375732, | |
| "learning_rate": 2.07145434861061e-06, | |
| "loss": 0.6358, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.4114882506527415, | |
| "grad_norm": 4.044537544250488, | |
| "learning_rate": 2.035366293756767e-06, | |
| "loss": 0.6867, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.4219321148825066, | |
| "grad_norm": 4.88841438293457, | |
| "learning_rate": 1.9992782389029233e-06, | |
| "loss": 0.7561, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.4323759791122717, | |
| "grad_norm": 7.33749532699585, | |
| "learning_rate": 1.96319018404908e-06, | |
| "loss": 0.6579, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.4428198433420367, | |
| "grad_norm": 6.818521976470947, | |
| "learning_rate": 1.9271021291952366e-06, | |
| "loss": 0.7322, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.453263707571802, | |
| "grad_norm": 5.549405097961426, | |
| "learning_rate": 1.8910140743413932e-06, | |
| "loss": 0.634, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.4637075718015664, | |
| "grad_norm": 6.154874801635742, | |
| "learning_rate": 1.85492601948755e-06, | |
| "loss": 0.6044, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.4741514360313315, | |
| "grad_norm": 5.5303521156311035, | |
| "learning_rate": 1.8188379646337066e-06, | |
| "loss": 0.6833, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.4845953002610965, | |
| "grad_norm": 6.135169982910156, | |
| "learning_rate": 1.7827499097798632e-06, | |
| "loss": 0.7765, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.4950391644908616, | |
| "grad_norm": 7.397289752960205, | |
| "learning_rate": 1.7466618549260197e-06, | |
| "loss": 0.6748, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.5054830287206267, | |
| "grad_norm": 5.909689426422119, | |
| "learning_rate": 1.7105738000721763e-06, | |
| "loss": 0.6791, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.5054830287206267, | |
| "eval_loss": 0.955007791519165, | |
| "eval_runtime": 23.2476, | |
| "eval_samples_per_second": 36.606, | |
| "eval_steps_per_second": 4.603, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.5159268929503917, | |
| "grad_norm": 6.320558547973633, | |
| "learning_rate": 1.6744857452183328e-06, | |
| "loss": 0.6219, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.526370757180157, | |
| "grad_norm": 7.978168487548828, | |
| "learning_rate": 1.6383976903644894e-06, | |
| "loss": 0.8254, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.5368146214099214, | |
| "grad_norm": 6.5808210372924805, | |
| "learning_rate": 1.602309635510646e-06, | |
| "loss": 0.7243, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.547258485639687, | |
| "grad_norm": 4.769480228424072, | |
| "learning_rate": 1.5662215806568025e-06, | |
| "loss": 0.7497, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.5577023498694516, | |
| "grad_norm": 5.738780975341797, | |
| "learning_rate": 1.5301335258029592e-06, | |
| "loss": 0.7048, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.5681462140992166, | |
| "grad_norm": 5.658013343811035, | |
| "learning_rate": 1.4940454709491159e-06, | |
| "loss": 0.739, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.5785900783289817, | |
| "grad_norm": 6.587325096130371, | |
| "learning_rate": 1.4579574160952725e-06, | |
| "loss": 0.7076, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.5890339425587467, | |
| "grad_norm": 5.956645965576172, | |
| "learning_rate": 1.4218693612414292e-06, | |
| "loss": 0.6372, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.599477806788512, | |
| "grad_norm": 5.966655731201172, | |
| "learning_rate": 1.3857813063875859e-06, | |
| "loss": 0.6934, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.609921671018277, | |
| "grad_norm": 5.313653945922852, | |
| "learning_rate": 1.3496932515337425e-06, | |
| "loss": 0.7163, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.620365535248042, | |
| "grad_norm": 6.935596466064453, | |
| "learning_rate": 1.313605196679899e-06, | |
| "loss": 0.7104, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.6308093994778066, | |
| "grad_norm": 4.822442054748535, | |
| "learning_rate": 1.2775171418260556e-06, | |
| "loss": 0.6378, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.641253263707572, | |
| "grad_norm": 5.288422107696533, | |
| "learning_rate": 1.2414290869722123e-06, | |
| "loss": 0.6463, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.6516971279373367, | |
| "grad_norm": 6.668851852416992, | |
| "learning_rate": 1.205341032118369e-06, | |
| "loss": 0.7505, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.6621409921671018, | |
| "grad_norm": 5.71054220199585, | |
| "learning_rate": 1.1692529772645256e-06, | |
| "loss": 0.5856, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.672584856396867, | |
| "grad_norm": 6.284550666809082, | |
| "learning_rate": 1.1331649224106823e-06, | |
| "loss": 0.8122, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.683028720626632, | |
| "grad_norm": 8.781463623046875, | |
| "learning_rate": 1.097076867556839e-06, | |
| "loss": 0.7063, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.693472584856397, | |
| "grad_norm": 7.29454231262207, | |
| "learning_rate": 1.0609888127029954e-06, | |
| "loss": 0.7429, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.703916449086162, | |
| "grad_norm": 5.689371109008789, | |
| "learning_rate": 1.024900757849152e-06, | |
| "loss": 0.6658, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.714360313315927, | |
| "grad_norm": 7.286506175994873, | |
| "learning_rate": 9.888127029953087e-07, | |
| "loss": 0.748, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.714360313315927, | |
| "eval_loss": 0.9431054592132568, | |
| "eval_runtime": 23.1747, | |
| "eval_samples_per_second": 36.721, | |
| "eval_steps_per_second": 4.617, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.7248041775456917, | |
| "grad_norm": 5.635782241821289, | |
| "learning_rate": 9.527246481414652e-07, | |
| "loss": 0.673, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.7352480417754568, | |
| "grad_norm": 5.282413959503174, | |
| "learning_rate": 9.166365932876219e-07, | |
| "loss": 0.9037, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.745691906005222, | |
| "grad_norm": 7.922749042510986, | |
| "learning_rate": 8.805485384337785e-07, | |
| "loss": 0.6862, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.756135770234987, | |
| "grad_norm": 5.463962078094482, | |
| "learning_rate": 8.444604835799351e-07, | |
| "loss": 0.6, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.766579634464752, | |
| "grad_norm": 8.0007963180542, | |
| "learning_rate": 8.083724287260918e-07, | |
| "loss": 0.6688, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.777023498694517, | |
| "grad_norm": 7.617900371551514, | |
| "learning_rate": 7.722843738722483e-07, | |
| "loss": 0.7277, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.787467362924282, | |
| "grad_norm": 5.969784259796143, | |
| "learning_rate": 7.36196319018405e-07, | |
| "loss": 0.7085, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.7979112271540467, | |
| "grad_norm": 5.169407367706299, | |
| "learning_rate": 7.001082641645617e-07, | |
| "loss": 0.7344, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.8083550913838122, | |
| "grad_norm": 8.009687423706055, | |
| "learning_rate": 6.640202093107181e-07, | |
| "loss": 0.6457, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.818798955613577, | |
| "grad_norm": 7.4137187004089355, | |
| "learning_rate": 6.279321544568748e-07, | |
| "loss": 0.6416, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.829242819843342, | |
| "grad_norm": 5.35453462600708, | |
| "learning_rate": 5.918440996030314e-07, | |
| "loss": 0.7867, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.839686684073107, | |
| "grad_norm": 7.469908237457275, | |
| "learning_rate": 5.557560447491881e-07, | |
| "loss": 0.7955, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.850130548302872, | |
| "grad_norm": 6.326605319976807, | |
| "learning_rate": 5.196679898953446e-07, | |
| "loss": 0.6995, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.860574412532637, | |
| "grad_norm": 7.096553802490234, | |
| "learning_rate": 4.835799350415013e-07, | |
| "loss": 0.6625, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.871018276762402, | |
| "grad_norm": 5.128674507141113, | |
| "learning_rate": 4.474918801876579e-07, | |
| "loss": 0.6791, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.8814621409921672, | |
| "grad_norm": 6.457350254058838, | |
| "learning_rate": 4.1140382533381457e-07, | |
| "loss": 0.7586, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.891906005221932, | |
| "grad_norm": 6.231655597686768, | |
| "learning_rate": 3.753157704799711e-07, | |
| "loss": 0.677, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.9023498694516974, | |
| "grad_norm": 6.412544250488281, | |
| "learning_rate": 3.392277156261278e-07, | |
| "loss": 0.7449, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.912793733681462, | |
| "grad_norm": 3.725374698638916, | |
| "learning_rate": 3.031396607722844e-07, | |
| "loss": 0.5804, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.923237597911227, | |
| "grad_norm": 5.284286975860596, | |
| "learning_rate": 2.67051605918441e-07, | |
| "loss": 0.571, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.923237597911227, | |
| "eval_loss": 0.94057297706604, | |
| "eval_runtime": 23.1841, | |
| "eval_samples_per_second": 36.706, | |
| "eval_steps_per_second": 4.615, | |
| "step": 2800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2871, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.029626779598848e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |