{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2015,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004962779156327543,
      "grad_norm": NaN,
      "learning_rate": 1.9982133995037223e-05,
      "loss": 4.0715,
      "step": 10
    },
    {
      "epoch": 0.009925558312655087,
      "grad_norm": 7.404272079467773,
      "learning_rate": 1.996228287841191e-05,
      "loss": 2.8811,
      "step": 20
    },
    {
      "epoch": 0.01488833746898263,
      "grad_norm": 13.565997123718262,
      "learning_rate": 1.99424317617866e-05,
      "loss": 3.1689,
      "step": 30
    },
    {
      "epoch": 0.019851116625310174,
      "grad_norm": 15.281591415405273,
      "learning_rate": 1.9922580645161292e-05,
      "loss": 3.0516,
      "step": 40
    },
    {
      "epoch": 0.02481389578163772,
      "grad_norm": 7.509647369384766,
      "learning_rate": 1.9902729528535983e-05,
      "loss": 2.9276,
      "step": 50
    },
    {
      "epoch": 0.02977667493796526,
      "grad_norm": 7.238119602203369,
      "learning_rate": 1.988287841191067e-05,
      "loss": 2.0641,
      "step": 60
    },
    {
      "epoch": 0.034739454094292806,
      "grad_norm": 8.467841148376465,
      "learning_rate": 1.9863027295285362e-05,
      "loss": 1.8259,
      "step": 70
    },
    {
      "epoch": 0.03970223325062035,
      "grad_norm": 7.360463619232178,
      "learning_rate": 1.984317617866005e-05,
      "loss": 1.7238,
      "step": 80
    },
    {
      "epoch": 0.04466501240694789,
      "grad_norm": 6.133955478668213,
      "learning_rate": 1.982332506203474e-05,
      "loss": 1.3318,
      "step": 90
    },
    {
      "epoch": 0.04962779156327544,
      "grad_norm": 6.2159295082092285,
      "learning_rate": 1.980347394540943e-05,
      "loss": 1.1172,
      "step": 100
    },
    {
      "epoch": 0.05459057071960298,
      "grad_norm": 2.4206221103668213,
      "learning_rate": 1.9783622828784122e-05,
      "loss": 1.033,
      "step": 110
    },
    {
      "epoch": 0.05955334987593052,
      "grad_norm": 5.191859245300293,
      "learning_rate": 1.976377171215881e-05,
      "loss": 0.98,
      "step": 120
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 2.742246150970459,
      "learning_rate": 1.97439205955335e-05,
      "loss": 0.7428,
      "step": 130
    },
    {
      "epoch": 0.06947890818858561,
      "grad_norm": 4.614717960357666,
      "learning_rate": 1.972406947890819e-05,
      "loss": 0.9041,
      "step": 140
    },
    {
      "epoch": 0.07444168734491315,
      "grad_norm": 3.7935266494750977,
      "learning_rate": 1.9704218362282882e-05,
      "loss": 0.5919,
      "step": 150
    },
    {
      "epoch": 0.0794044665012407,
      "grad_norm": 3.5459160804748535,
      "learning_rate": 1.968436724565757e-05,
      "loss": 0.7932,
      "step": 160
    },
    {
      "epoch": 0.08436724565756824,
      "grad_norm": 1.8404773473739624,
      "learning_rate": 1.966451612903226e-05,
      "loss": 0.7619,
      "step": 170
    },
    {
      "epoch": 0.08933002481389578,
      "grad_norm": 2.2088851928710938,
      "learning_rate": 1.964466501240695e-05,
      "loss": 0.6204,
      "step": 180
    },
    {
      "epoch": 0.09429280397022333,
      "grad_norm": 1.566972017288208,
      "learning_rate": 1.962481389578164e-05,
      "loss": 0.7081,
      "step": 190
    },
    {
      "epoch": 0.09925558312655088,
      "grad_norm": 1.225921630859375,
      "learning_rate": 1.960496277915633e-05,
      "loss": 0.5577,
      "step": 200
    },
    {
      "epoch": 0.10421836228287841,
      "grad_norm": 2.0241658687591553,
      "learning_rate": 1.958511166253102e-05,
      "loss": 0.5101,
      "step": 210
    },
    {
      "epoch": 0.10918114143920596,
      "grad_norm": 2.1363158226013184,
      "learning_rate": 1.956526054590571e-05,
      "loss": 0.4952,
      "step": 220
    },
    {
      "epoch": 0.1141439205955335,
      "grad_norm": 2.7994675636291504,
      "learning_rate": 1.9545409429280396e-05,
      "loss": 0.5103,
      "step": 230
    },
    {
      "epoch": 0.11910669975186104,
      "grad_norm": 2.4477338790893555,
      "learning_rate": 1.9525558312655087e-05,
      "loss": 0.3801,
      "step": 240
    },
    {
      "epoch": 0.12406947890818859,
      "grad_norm": 2.0796492099761963,
      "learning_rate": 1.9505707196029778e-05,
      "loss": 0.4258,
      "step": 250
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 1.3138341903686523,
      "learning_rate": 1.948585607940447e-05,
      "loss": 0.4838,
      "step": 260
    },
    {
      "epoch": 0.13399503722084366,
      "grad_norm": 1.5272380113601685,
      "learning_rate": 1.9466004962779156e-05,
      "loss": 0.366,
      "step": 270
    },
    {
      "epoch": 0.13895781637717122,
      "grad_norm": 3.6165964603424072,
      "learning_rate": 1.9446153846153847e-05,
      "loss": 0.4171,
      "step": 280
    },
    {
      "epoch": 0.14392059553349876,
      "grad_norm": 2.2593328952789307,
      "learning_rate": 1.9426302729528538e-05,
      "loss": 0.3028,
      "step": 290
    },
    {
      "epoch": 0.1488833746898263,
      "grad_norm": 1.5910056829452515,
      "learning_rate": 1.940645161290323e-05,
      "loss": 0.3047,
      "step": 300
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 1.767904281616211,
      "learning_rate": 1.9386600496277917e-05,
      "loss": 0.4552,
      "step": 310
    },
    {
      "epoch": 0.1588089330024814,
      "grad_norm": 1.7085593938827515,
      "learning_rate": 1.9366749379652608e-05,
      "loss": 0.2556,
      "step": 320
    },
    {
      "epoch": 0.16377171215880892,
      "grad_norm": 1.355582594871521,
      "learning_rate": 1.9346898263027295e-05,
      "loss": 0.4337,
      "step": 330
    },
    {
      "epoch": 0.1687344913151365,
      "grad_norm": 0.8261783123016357,
      "learning_rate": 1.9327047146401986e-05,
      "loss": 0.3328,
      "step": 340
    },
    {
      "epoch": 0.17369727047146402,
      "grad_norm": 0.8605831861495972,
      "learning_rate": 1.9307196029776677e-05,
      "loss": 0.2587,
      "step": 350
    },
    {
      "epoch": 0.17866004962779156,
      "grad_norm": 2.7887861728668213,
      "learning_rate": 1.9287344913151368e-05,
      "loss": 0.3559,
      "step": 360
    },
    {
      "epoch": 0.18362282878411912,
      "grad_norm": 1.0830411911010742,
      "learning_rate": 1.9267493796526055e-05,
      "loss": 0.1826,
      "step": 370
    },
    {
      "epoch": 0.18858560794044665,
      "grad_norm": 3.8424558639526367,
      "learning_rate": 1.9247642679900746e-05,
      "loss": 0.2587,
      "step": 380
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 3.894951343536377,
      "learning_rate": 1.9227791563275434e-05,
      "loss": 0.2034,
      "step": 390
    },
    {
      "epoch": 0.19851116625310175,
      "grad_norm": 3.139410972595215,
      "learning_rate": 1.9207940446650125e-05,
      "loss": 0.2387,
      "step": 400
    },
    {
      "epoch": 0.20347394540942929,
      "grad_norm": 1.7462635040283203,
      "learning_rate": 1.9188089330024816e-05,
      "loss": 0.2626,
      "step": 410
    },
    {
      "epoch": 0.20843672456575682,
      "grad_norm": 4.002986907958984,
      "learning_rate": 1.9168238213399507e-05,
      "loss": 0.1782,
      "step": 420
    },
    {
      "epoch": 0.21339950372208435,
      "grad_norm": 2.251654624938965,
      "learning_rate": 1.9148387096774194e-05,
      "loss": 0.1621,
      "step": 430
    },
    {
      "epoch": 0.21836228287841192,
      "grad_norm": 0.6951057314872742,
      "learning_rate": 1.9128535980148885e-05,
      "loss": 0.1937,
      "step": 440
    },
    {
      "epoch": 0.22332506203473945,
      "grad_norm": 0.8540646433830261,
      "learning_rate": 1.9108684863523576e-05,
      "loss": 0.1913,
      "step": 450
    },
    {
      "epoch": 0.228287841191067,
      "grad_norm": 0.9929500222206116,
      "learning_rate": 1.9088833746898267e-05,
      "loss": 0.2433,
      "step": 460
    },
    {
      "epoch": 0.23325062034739455,
      "grad_norm": 3.8289272785186768,
      "learning_rate": 1.9068982630272954e-05,
      "loss": 0.1903,
      "step": 470
    },
    {
      "epoch": 0.23821339950372208,
      "grad_norm": 4.537591934204102,
      "learning_rate": 1.9049131513647645e-05,
      "loss": 0.2136,
      "step": 480
    },
    {
      "epoch": 0.24317617866004962,
      "grad_norm": 0.20700696110725403,
      "learning_rate": 1.9029280397022333e-05,
      "loss": 0.1888,
      "step": 490
    },
    {
      "epoch": 0.24813895781637718,
      "grad_norm": 0.4332411289215088,
      "learning_rate": 1.9009429280397024e-05,
      "loss": 0.2358,
      "step": 500
    },
    {
      "epoch": 0.2531017369727047,
      "grad_norm": 1.050528645515442,
      "learning_rate": 1.8989578163771715e-05,
      "loss": 0.0832,
      "step": 510
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 0.22720620036125183,
      "learning_rate": 1.8969727047146406e-05,
      "loss": 0.1272,
      "step": 520
    },
    {
      "epoch": 0.2630272952853598,
      "grad_norm": 3.576136350631714,
      "learning_rate": 1.8949875930521093e-05,
      "loss": 0.2066,
      "step": 530
    },
    {
      "epoch": 0.2679900744416873,
      "grad_norm": 0.33265048265457153,
      "learning_rate": 1.8930024813895784e-05,
      "loss": 0.0556,
      "step": 540
    },
    {
      "epoch": 0.2729528535980149,
      "grad_norm": 0.6722800731658936,
      "learning_rate": 1.891017369727047e-05,
      "loss": 0.0807,
      "step": 550
    },
    {
      "epoch": 0.27791563275434245,
      "grad_norm": 1.2628779411315918,
      "learning_rate": 1.8890322580645163e-05,
      "loss": 0.24,
      "step": 560
    },
    {
      "epoch": 0.28287841191067,
      "grad_norm": 0.27271902561187744,
      "learning_rate": 1.8870471464019853e-05,
      "loss": 0.077,
      "step": 570
    },
    {
      "epoch": 0.2878411910669975,
      "grad_norm": 3.738210916519165,
      "learning_rate": 1.885062034739454e-05,
      "loss": 0.395,
      "step": 580
    },
    {
      "epoch": 0.29280397022332505,
      "grad_norm": 0.6131235361099243,
      "learning_rate": 1.8830769230769232e-05,
      "loss": 0.1971,
      "step": 590
    },
    {
      "epoch": 0.2977667493796526,
      "grad_norm": 0.8646567463874817,
      "learning_rate": 1.8810918114143923e-05,
      "loss": 0.0669,
      "step": 600
    },
    {
      "epoch": 0.3027295285359802,
      "grad_norm": 4.964555263519287,
      "learning_rate": 1.8791066997518614e-05,
      "loss": 0.2324,
      "step": 610
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.5028713941574097,
      "learning_rate": 1.87712158808933e-05,
      "loss": 0.1819,
      "step": 620
    },
    {
      "epoch": 0.31265508684863524,
      "grad_norm": 0.31731173396110535,
      "learning_rate": 1.8751364764267992e-05,
      "loss": 0.1463,
      "step": 630
    },
    {
      "epoch": 0.3176178660049628,
      "grad_norm": 0.23767873644828796,
      "learning_rate": 1.873151364764268e-05,
      "loss": 0.0874,
      "step": 640
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 0.5757867693901062,
      "learning_rate": 1.871166253101737e-05,
      "loss": 0.1267,
      "step": 650
    },
    {
      "epoch": 0.32754342431761785,
      "grad_norm": 2.5683581829071045,
      "learning_rate": 1.869181141439206e-05,
      "loss": 0.0823,
      "step": 660
    },
    {
      "epoch": 0.3325062034739454,
      "grad_norm": 0.22655190527439117,
      "learning_rate": 1.8671960297766752e-05,
      "loss": 0.2087,
      "step": 670
    },
    {
      "epoch": 0.337468982630273,
      "grad_norm": 2.4306397438049316,
      "learning_rate": 1.865210918114144e-05,
      "loss": 0.1034,
      "step": 680
    },
    {
      "epoch": 0.3424317617866005,
      "grad_norm": 0.11680830270051956,
      "learning_rate": 1.863225806451613e-05,
      "loss": 0.0316,
      "step": 690
    },
    {
      "epoch": 0.34739454094292804,
      "grad_norm": 0.27029949426651,
      "learning_rate": 1.861240694789082e-05,
      "loss": 0.131,
      "step": 700
    },
    {
      "epoch": 0.3523573200992556,
      "grad_norm": 0.31934475898742676,
      "learning_rate": 1.859255583126551e-05,
      "loss": 0.1961,
      "step": 710
    },
    {
      "epoch": 0.3573200992555831,
      "grad_norm": 6.141274929046631,
      "learning_rate": 1.85727047146402e-05,
      "loss": 0.2118,
      "step": 720
    },
    {
      "epoch": 0.36228287841191065,
      "grad_norm": 0.8305376172065735,
      "learning_rate": 1.855285359801489e-05,
      "loss": 0.0207,
      "step": 730
    },
    {
      "epoch": 0.36724565756823824,
      "grad_norm": 1.411669135093689,
      "learning_rate": 1.853300248138958e-05,
      "loss": 0.0639,
      "step": 740
    },
    {
      "epoch": 0.37220843672456577,
      "grad_norm": 2.125790596008301,
      "learning_rate": 1.851315136476427e-05,
      "loss": 0.2443,
      "step": 750
    },
    {
      "epoch": 0.3771712158808933,
      "grad_norm": 0.9730265736579895,
      "learning_rate": 1.849330024813896e-05,
      "loss": 0.0786,
      "step": 760
    },
    {
      "epoch": 0.38213399503722084,
      "grad_norm": 3.10215163230896,
      "learning_rate": 1.847344913151365e-05,
      "loss": 0.3467,
      "step": 770
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 3.7202343940734863,
      "learning_rate": 1.845359801488834e-05,
      "loss": 0.2285,
      "step": 780
    },
    {
      "epoch": 0.3920595533498759,
      "grad_norm": 0.5115141272544861,
      "learning_rate": 1.843374689826303e-05,
      "loss": 0.1705,
      "step": 790
    },
    {
      "epoch": 0.3970223325062035,
      "grad_norm": 0.4430786669254303,
      "learning_rate": 1.8413895781637717e-05,
      "loss": 0.0631,
      "step": 800
    },
    {
      "epoch": 0.40198511166253104,
      "grad_norm": 4.2651143074035645,
      "learning_rate": 1.8394044665012408e-05,
      "loss": 0.1615,
      "step": 810
    },
    {
      "epoch": 0.40694789081885857,
      "grad_norm": 5.431081295013428,
      "learning_rate": 1.83741935483871e-05,
      "loss": 0.0325,
      "step": 820
    },
    {
      "epoch": 0.4119106699751861,
      "grad_norm": 1.7246816158294678,
      "learning_rate": 1.835434243176179e-05,
      "loss": 0.0818,
      "step": 830
    },
    {
      "epoch": 0.41687344913151364,
      "grad_norm": 0.11597002297639847,
      "learning_rate": 1.8334491315136478e-05,
      "loss": 0.1661,
      "step": 840
    },
    {
      "epoch": 0.4218362282878412,
      "grad_norm": 5.003543853759766,
      "learning_rate": 1.831464019851117e-05,
      "loss": 0.0775,
      "step": 850
    },
    {
      "epoch": 0.4267990074441687,
      "grad_norm": 6.160369396209717,
      "learning_rate": 1.8294789081885856e-05,
      "loss": 0.0777,
      "step": 860
    },
    {
      "epoch": 0.4317617866004963,
      "grad_norm": 0.4120528995990753,
      "learning_rate": 1.8274937965260547e-05,
      "loss": 0.0515,
      "step": 870
    },
    {
      "epoch": 0.43672456575682383,
      "grad_norm": 0.1204703077673912,
      "learning_rate": 1.8255086848635238e-05,
      "loss": 0.0356,
      "step": 880
    },
    {
      "epoch": 0.44168734491315137,
      "grad_norm": 0.09946464747190475,
      "learning_rate": 1.8235235732009925e-05,
      "loss": 0.1525,
      "step": 890
    },
    {
      "epoch": 0.4466501240694789,
      "grad_norm": 1.1095144748687744,
      "learning_rate": 1.8215384615384616e-05,
      "loss": 0.098,
      "step": 900
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 0.15330803394317627,
      "learning_rate": 1.8195533498759307e-05,
      "loss": 0.0986,
      "step": 910
    },
    {
      "epoch": 0.456575682382134,
      "grad_norm": 0.231792613863945,
      "learning_rate": 1.8175682382133998e-05,
      "loss": 0.1464,
      "step": 920
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.15597473084926605,
      "learning_rate": 1.8155831265508686e-05,
      "loss": 0.166,
      "step": 930
    },
    {
      "epoch": 0.4665012406947891,
      "grad_norm": 1.0127346515655518,
      "learning_rate": 1.8135980148883377e-05,
      "loss": 0.1167,
      "step": 940
    },
    {
      "epoch": 0.47146401985111663,
      "grad_norm": 2.508481740951538,
      "learning_rate": 1.8116129032258064e-05,
      "loss": 0.0913,
      "step": 950
    },
    {
      "epoch": 0.47642679900744417,
      "grad_norm": 0.06334420293569565,
      "learning_rate": 1.8096277915632755e-05,
      "loss": 0.0285,
      "step": 960
    },
    {
      "epoch": 0.4813895781637717,
      "grad_norm": 0.1302240639925003,
      "learning_rate": 1.8076426799007446e-05,
      "loss": 0.123,
      "step": 970
    },
    {
      "epoch": 0.48635235732009924,
      "grad_norm": 1.0986523628234863,
      "learning_rate": 1.8056575682382137e-05,
      "loss": 0.0978,
      "step": 980
    },
    {
      "epoch": 0.4913151364764268,
      "grad_norm": 0.08456585556268692,
      "learning_rate": 1.8036724565756824e-05,
      "loss": 0.253,
      "step": 990
    },
    {
      "epoch": 0.49627791563275436,
      "grad_norm": 3.1052098274230957,
      "learning_rate": 1.8016873449131515e-05,
      "loss": 0.0929,
      "step": 1000
    },
    {
      "epoch": 0.5012406947890818,
      "grad_norm": 2.032149314880371,
      "learning_rate": 1.7997022332506203e-05,
      "loss": 0.2004,
      "step": 1010
    },
    {
      "epoch": 0.5062034739454094,
      "grad_norm": 0.11190329492092133,
      "learning_rate": 1.7977171215880894e-05,
      "loss": 0.0892,
      "step": 1020
    },
    {
      "epoch": 0.511166253101737,
      "grad_norm": 3.9147121906280518,
      "learning_rate": 1.7957320099255585e-05,
      "loss": 0.1345,
      "step": 1030
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 0.1115802675485611,
      "learning_rate": 1.7937468982630276e-05,
      "loss": 0.0167,
      "step": 1040
    },
    {
      "epoch": 0.5210918114143921,
      "grad_norm": 0.09744033217430115,
      "learning_rate": 1.7917617866004963e-05,
      "loss": 0.0726,
      "step": 1050
    },
    {
      "epoch": 0.5260545905707196,
      "grad_norm": 8.279594421386719,
      "learning_rate": 1.7897766749379654e-05,
      "loss": 0.1851,
      "step": 1060
    },
    {
      "epoch": 0.5310173697270472,
      "grad_norm": 0.12151824682950974,
      "learning_rate": 1.7877915632754345e-05,
      "loss": 0.1491,
      "step": 1070
    },
    {
      "epoch": 0.5359801488833746,
      "grad_norm": 0.8291270136833191,
      "learning_rate": 1.7858064516129036e-05,
      "loss": 0.0114,
      "step": 1080
    },
    {
      "epoch": 0.5409429280397022,
      "grad_norm": 3.0179080963134766,
      "learning_rate": 1.7838213399503723e-05,
      "loss": 0.1433,
      "step": 1090
    },
    {
      "epoch": 0.5459057071960298,
      "grad_norm": 3.6688222885131836,
      "learning_rate": 1.7818362282878414e-05,
      "loss": 0.2806,
      "step": 1100
    },
    {
      "epoch": 0.5508684863523573,
      "grad_norm": 0.08318022638559341,
      "learning_rate": 1.7798511166253102e-05,
      "loss": 0.0321,
      "step": 1110
    },
    {
      "epoch": 0.5558312655086849,
      "grad_norm": 0.16207154095172882,
      "learning_rate": 1.7778660049627793e-05,
      "loss": 0.0684,
      "step": 1120
    },
    {
      "epoch": 0.5607940446650124,
      "grad_norm": 7.777888774871826,
      "learning_rate": 1.7758808933002484e-05,
      "loss": 0.1594,
      "step": 1130
    },
    {
      "epoch": 0.56575682382134,
      "grad_norm": 0.07727096229791641,
      "learning_rate": 1.7738957816377175e-05,
      "loss": 0.0785,
      "step": 1140
    },
    {
      "epoch": 0.5707196029776674,
      "grad_norm": 0.2622874975204468,
      "learning_rate": 1.7719106699751862e-05,
      "loss": 0.1995,
      "step": 1150
    },
    {
      "epoch": 0.575682382133995,
      "grad_norm": 3.662970542907715,
      "learning_rate": 1.7699255583126553e-05,
      "loss": 0.1673,
      "step": 1160
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 7.050316333770752,
      "learning_rate": 1.767940446650124e-05,
      "loss": 0.2565,
      "step": 1170
    },
    {
      "epoch": 0.5856079404466501,
      "grad_norm": 0.09787000715732574,
      "learning_rate": 1.765955334987593e-05,
      "loss": 0.1664,
      "step": 1180
    },
    {
      "epoch": 0.5905707196029777,
      "grad_norm": 0.08483708649873734,
      "learning_rate": 1.7639702233250622e-05,
      "loss": 0.1026,
      "step": 1190
    },
    {
      "epoch": 0.5955334987593052,
      "grad_norm": 2.9109203815460205,
      "learning_rate": 1.761985111662531e-05,
      "loss": 0.1508,
      "step": 1200
    },
    {
      "epoch": 0.6004962779156328,
      "grad_norm": 0.09153315424919128,
      "learning_rate": 1.76e-05,
      "loss": 0.0846,
      "step": 1210
    },
    {
      "epoch": 0.6054590570719603,
      "grad_norm": 0.5280864834785461,
      "learning_rate": 1.7580148883374692e-05,
      "loss": 0.1233,
      "step": 1220
    },
    {
      "epoch": 0.6104218362282878,
      "grad_norm": 0.20713864266872406,
      "learning_rate": 1.7560297766749383e-05,
      "loss": 0.0837,
      "step": 1230
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.20489178597927094,
      "learning_rate": 1.754044665012407e-05,
      "loss": 0.0938,
      "step": 1240
    },
    {
      "epoch": 0.6203473945409429,
      "grad_norm": 0.0775533989071846,
      "learning_rate": 1.752059553349876e-05,
      "loss": 0.0832,
      "step": 1250
    },
    {
      "epoch": 0.6253101736972705,
      "grad_norm": 0.16079996526241302,
      "learning_rate": 1.750074441687345e-05,
      "loss": 0.1211,
      "step": 1260
    },
    {
      "epoch": 0.630272952853598,
      "grad_norm": 0.06369619816541672,
      "learning_rate": 1.748089330024814e-05,
      "loss": 0.0561,
      "step": 1270
    },
    {
      "epoch": 0.6352357320099256,
      "grad_norm": 2.8441195487976074,
      "learning_rate": 1.746104218362283e-05,
      "loss": 0.1201,
      "step": 1280
    },
    {
      "epoch": 0.6401985111662531,
      "grad_norm": 6.837566375732422,
      "learning_rate": 1.744119106699752e-05,
      "loss": 0.1761,
      "step": 1290
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 0.06762542575597763,
      "learning_rate": 1.742133995037221e-05,
      "loss": 0.1454,
      "step": 1300
    },
    {
      "epoch": 0.6501240694789082,
      "grad_norm": 0.10768511891365051,
      "learning_rate": 1.74014888337469e-05,
      "loss": 0.1109,
      "step": 1310
    },
    {
      "epoch": 0.6550868486352357,
      "grad_norm": 0.14378197491168976,
      "learning_rate": 1.7381637717121587e-05,
      "loss": 0.0964,
      "step": 1320
    },
    {
      "epoch": 0.6600496277915633,
      "grad_norm": 5.744873523712158,
      "learning_rate": 1.736178660049628e-05,
      "loss": 0.1388,
      "step": 1330
    },
    {
      "epoch": 0.6650124069478908,
      "grad_norm": 1.2244545221328735,
      "learning_rate": 1.734193548387097e-05,
      "loss": 0.0129,
      "step": 1340
    },
    {
      "epoch": 0.6699751861042184,
      "grad_norm": 6.384188175201416,
      "learning_rate": 1.732208436724566e-05,
      "loss": 0.3897,
      "step": 1350
    },
    {
      "epoch": 0.674937965260546,
      "grad_norm": 11.079994201660156,
      "learning_rate": 1.7302233250620348e-05,
      "loss": 0.1177,
      "step": 1360
    },
    {
      "epoch": 0.6799007444168734,
      "grad_norm": 0.9940871000289917,
      "learning_rate": 1.728238213399504e-05,
      "loss": 0.0522,
      "step": 1370
    },
    {
      "epoch": 0.684863523573201,
      "grad_norm": 4.725884914398193,
      "learning_rate": 1.726253101736973e-05,
      "loss": 0.2414,
      "step": 1380
    },
    {
      "epoch": 0.6898263027295285,
      "grad_norm": 8.218222618103027,
      "learning_rate": 1.724267990074442e-05,
      "loss": 0.0943,
      "step": 1390
    },
    {
      "epoch": 0.6947890818858561,
      "grad_norm": 0.0562286414206028,
      "learning_rate": 1.7222828784119108e-05,
      "loss": 0.0122,
      "step": 1400
    },
    {
      "epoch": 0.6997518610421837,
      "grad_norm": 0.0633021891117096,
      "learning_rate": 1.72029776674938e-05,
      "loss": 0.0148,
      "step": 1410
    },
    {
      "epoch": 0.7047146401985112,
      "grad_norm": 0.23889249563217163,
      "learning_rate": 1.7183126550868486e-05,
      "loss": 0.0958,
      "step": 1420
    },
    {
      "epoch": 0.7096774193548387,
      "grad_norm": 3.1089024543762207,
      "learning_rate": 1.7163275434243177e-05,
      "loss": 0.2354,
      "step": 1430
    },
    {
      "epoch": 0.7146401985111662,
      "grad_norm": 0.04075319692492485,
      "learning_rate": 1.7143424317617868e-05,
      "loss": 0.1398,
      "step": 1440
    },
    {
      "epoch": 0.7196029776674938,
      "grad_norm": 0.2210625857114792,
      "learning_rate": 1.712357320099256e-05,
      "loss": 0.0855,
      "step": 1450
    },
    {
      "epoch": 0.7245657568238213,
      "grad_norm": 6.384485244750977,
      "learning_rate": 1.7103722084367247e-05,
      "loss": 0.2229,
      "step": 1460
    },
    {
      "epoch": 0.7295285359801489,
      "grad_norm": 0.10448583960533142,
      "learning_rate": 1.7083870967741938e-05,
      "loss": 0.0054,
      "step": 1470
    },
    {
      "epoch": 0.7344913151364765,
      "grad_norm": 0.06005546450614929,
      "learning_rate": 1.7064019851116625e-05,
      "loss": 0.0955,
      "step": 1480
    },
    {
      "epoch": 0.739454094292804,
      "grad_norm": 0.09095001220703125,
      "learning_rate": 1.7044168734491316e-05,
      "loss": 0.1075,
      "step": 1490
    },
    {
      "epoch": 0.7444168734491315,
      "grad_norm": 0.05384785681962967,
      "learning_rate": 1.7024317617866007e-05,
      "loss": 0.1336,
      "step": 1500
    },
    {
      "epoch": 0.749379652605459,
      "grad_norm": 3.4298574924468994,
      "learning_rate": 1.7004466501240694e-05,
      "loss": 0.0192,
      "step": 1510
    },
    {
      "epoch": 0.7543424317617866,
      "grad_norm": 0.11529748886823654,
      "learning_rate": 1.6984615384615385e-05,
      "loss": 0.1036,
      "step": 1520
    },
    {
      "epoch": 0.7593052109181141,
      "grad_norm": 13.056891441345215,
      "learning_rate": 1.6964764267990076e-05,
      "loss": 0.0958,
      "step": 1530
    },
    {
      "epoch": 0.7642679900744417,
      "grad_norm": 0.05889654532074928,
      "learning_rate": 1.6944913151364767e-05,
      "loss": 0.1754,
      "step": 1540
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 5.287586688995361,
      "learning_rate": 1.6925062034739455e-05,
      "loss": 0.1773,
      "step": 1550
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 4.1205878257751465,
      "learning_rate": 1.6905210918114146e-05,
      "loss": 0.2493,
      "step": 1560
    },
    {
      "epoch": 0.7791563275434243,
      "grad_norm": 0.19854427874088287,
      "learning_rate": 1.6885359801488833e-05,
      "loss": 0.1686,
      "step": 1570
    },
    {
      "epoch": 0.7841191066997518,
      "grad_norm": 0.05931422859430313,
      "learning_rate": 1.6865508684863524e-05,
      "loss": 0.0397,
      "step": 1580
    },
    {
      "epoch": 0.7890818858560794,
      "grad_norm": 0.10935617238283157,
      "learning_rate": 1.6845657568238215e-05,
      "loss": 0.0285,
      "step": 1590
    },
    {
      "epoch": 0.794044665012407,
      "grad_norm": 0.10486113280057907,
      "learning_rate": 1.6825806451612906e-05,
      "loss": 0.0781,
      "step": 1600
    },
    {
      "epoch": 0.7990074441687345,
      "grad_norm": 0.7170459032058716,
      "learning_rate": 1.6805955334987593e-05,
      "loss": 0.0829,
      "step": 1610
    },
    {
      "epoch": 0.8039702233250621,
      "grad_norm": 0.3470781445503235,
      "learning_rate": 1.6786104218362284e-05,
      "loss": 0.053,
      "step": 1620
    },
    {
      "epoch": 0.8089330024813896,
      "grad_norm": 4.012697219848633,
      "learning_rate": 1.6766253101736972e-05,
      "loss": 0.1086,
      "step": 1630
    },
    {
      "epoch": 0.8138957816377171,
      "grad_norm": 0.302435040473938,
      "learning_rate": 1.6746401985111663e-05,
      "loss": 0.0111,
      "step": 1640
    },
    {
      "epoch": 0.8188585607940446,
      "grad_norm": 0.06520848721265793,
      "learning_rate": 1.6726550868486354e-05,
      "loss": 0.0726,
      "step": 1650
    },
    {
      "epoch": 0.8238213399503722,
      "grad_norm": 0.09845948964357376,
      "learning_rate": 1.6706699751861045e-05,
      "loss": 0.0094,
      "step": 1660
    },
    {
      "epoch": 0.8287841191066998,
      "grad_norm": 0.03756513074040413,
      "learning_rate": 1.6686848635235732e-05,
      "loss": 0.23,
      "step": 1670
    },
    {
      "epoch": 0.8337468982630273,
      "grad_norm": 0.39116331934928894,
      "learning_rate": 1.6666997518610423e-05,
      "loss": 0.034,
      "step": 1680
    },
    {
      "epoch": 0.8387096774193549,
      "grad_norm": 0.8433384299278259,
      "learning_rate": 1.6647146401985114e-05,
      "loss": 0.0264,
      "step": 1690
    },
    {
      "epoch": 0.8436724565756824,
      "grad_norm": 0.05700427293777466,
      "learning_rate": 1.6627295285359805e-05,
      "loss": 0.2487,
      "step": 1700
    },
    {
      "epoch": 0.8486352357320099,
      "grad_norm": 8.650578498840332,
      "learning_rate": 1.6607444168734492e-05,
      "loss": 0.2098,
      "step": 1710
    },
    {
      "epoch": 0.8535980148883374,
      "grad_norm": 0.07815321534872055,
      "learning_rate": 1.6587593052109183e-05,
      "loss": 0.0113,
      "step": 1720
    },
    {
      "epoch": 0.858560794044665,
      "grad_norm": 0.11031467467546463,
      "learning_rate": 1.656774193548387e-05,
      "loss": 0.0744,
      "step": 1730
    },
    {
      "epoch": 0.8635235732009926,
      "grad_norm": 0.7240772247314453,
      "learning_rate": 1.6547890818858562e-05,
      "loss": 0.1165,
      "step": 1740
    },
    {
      "epoch": 0.8684863523573201,
      "grad_norm": 0.03234798088669777,
      "learning_rate": 1.6528039702233253e-05,
      "loss": 0.0678,
      "step": 1750
    },
    {
      "epoch": 0.8734491315136477,
      "grad_norm": 0.024767836555838585,
      "learning_rate": 1.6508188585607944e-05,
      "loss": 0.0519,
      "step": 1760
    },
    {
      "epoch": 0.8784119106699751,
      "grad_norm": 0.03122510015964508,
      "learning_rate": 1.648833746898263e-05,
      "loss": 0.1386,
      "step": 1770
    },
    {
      "epoch": 0.8833746898263027,
      "grad_norm": 0.824370265007019,
      "learning_rate": 1.6468486352357322e-05,
      "loss": 0.164,
      "step": 1780
    },
    {
      "epoch": 0.8883374689826302,
      "grad_norm": 5.419961452484131,
      "learning_rate": 1.644863523573201e-05,
      "loss": 0.1668,
      "step": 1790
    },
    {
      "epoch": 0.8933002481389578,
      "grad_norm": 0.03851868212223053,
      "learning_rate": 1.64287841191067e-05,
      "loss": 0.0614,
      "step": 1800
    },
    {
      "epoch": 0.8982630272952854,
      "grad_norm": 0.06279598921537399,
      "learning_rate": 1.640893300248139e-05,
      "loss": 0.0031,
      "step": 1810
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 0.04613855481147766,
      "learning_rate": 1.6389081885856082e-05,
      "loss": 0.1774,
      "step": 1820
    },
    {
      "epoch": 0.9081885856079405,
      "grad_norm": 0.6312947869300842,
      "learning_rate": 1.636923076923077e-05,
      "loss": 0.0815,
      "step": 1830
    },
    {
      "epoch": 0.913151364764268,
      "grad_norm": 0.161106139421463,
      "learning_rate": 1.634937965260546e-05,
      "loss": 0.1977,
      "step": 1840
    },
    {
      "epoch": 0.9181141439205955,
      "grad_norm": 0.03545048087835312,
      "learning_rate": 1.6329528535980152e-05,
      "loss": 0.0353,
      "step": 1850
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.1279628425836563,
      "learning_rate": 1.630967741935484e-05,
      "loss": 0.0503,
      "step": 1860
    },
    {
      "epoch": 0.9280397022332506,
      "grad_norm": 8.406203269958496,
      "learning_rate": 1.628982630272953e-05,
      "loss": 0.0463,
      "step": 1870
    },
    {
      "epoch": 0.9330024813895782,
      "grad_norm": 10.595025062561035,
      "learning_rate": 1.6269975186104218e-05,
      "loss": 0.0173,
      "step": 1880
    },
    {
      "epoch": 0.9379652605459057,
      "grad_norm": 0.7097483277320862,
      "learning_rate": 1.625012406947891e-05,
      "loss": 0.0596,
      "step": 1890
    },
    {
      "epoch": 0.9429280397022333,
      "grad_norm": 7.379908561706543,
      "learning_rate": 1.62302729528536e-05,
      "loss": 0.2075,
      "step": 1900
    },
    {
      "epoch": 0.9478908188585607,
      "grad_norm": 0.1548600196838379,
      "learning_rate": 1.621042183622829e-05,
      "loss": 0.2226,
      "step": 1910
    },
    {
      "epoch": 0.9528535980148883,
      "grad_norm": 0.026123059913516045,
      "learning_rate": 1.6190570719602978e-05,
      "loss": 0.1507,
      "step": 1920
    },
    {
      "epoch": 0.9578163771712159,
      "grad_norm": 0.08188609778881073,
      "learning_rate": 1.617071960297767e-05,
      "loss": 0.1326,
      "step": 1930
    },
    {
      "epoch": 0.9627791563275434,
      "grad_norm": 0.3817611634731293,
      "learning_rate": 1.6150868486352356e-05,
      "loss": 0.2298,
      "step": 1940
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 0.048138681799173355,
      "learning_rate": 1.6131017369727047e-05,
      "loss": 0.0676,
      "step": 1950
    },
    {
      "epoch": 0.9727047146401985,
      "grad_norm": 0.06823063641786575,
      "learning_rate": 1.6111166253101738e-05,
      "loss": 0.0727,
      "step": 1960
    },
    {
      "epoch": 0.9776674937965261,
      "grad_norm": 0.031212667003273964,
      "learning_rate": 1.609131513647643e-05,
      "loss": 0.0943,
      "step": 1970
    },
    {
      "epoch": 0.9826302729528535,
      "grad_norm": 7.244811534881592,
      "learning_rate": 1.6071464019851117e-05,
      "loss": 0.1229,
      "step": 1980
    },
    {
      "epoch": 0.9875930521091811,
      "grad_norm": 0.06972146779298782,
      "learning_rate": 1.6051612903225808e-05,
      "loss": 0.3056,
      "step": 1990
    },
    {
      "epoch": 0.9925558312655087,
      "grad_norm": 0.026395201683044434,
      "learning_rate": 1.60317617866005e-05,
      "loss": 0.0214,
      "step": 2000
    },
    {
      "epoch": 0.9975186104218362,
      "grad_norm": 1.8401563167572021,
      "learning_rate": 1.601191066997519e-05,
      "loss": 0.077,
      "step": 2010
    }
  ],
  "logging_steps": 10,
  "max_steps": 10075,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2178492222720000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}