| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 10000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 17.069135665893555, | |
| "learning_rate": 3.96e-06, | |
| "loss": 1.9267, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 7.49233341217041, | |
| "learning_rate": 7.960000000000002e-06, | |
| "loss": 1.7814, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 10.066174507141113, | |
| "learning_rate": 1.196e-05, | |
| "loss": 1.7612, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 8.212127685546875, | |
| "learning_rate": 1.5960000000000003e-05, | |
| "loss": 1.7866, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 5.494411945343018, | |
| "learning_rate": 1.9960000000000002e-05, | |
| "loss": 1.7809, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 6.4578728675842285, | |
| "learning_rate": 1.9980100502512564e-05, | |
| "loss": 1.7893, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 4.508376121520996, | |
| "learning_rate": 1.996020100502513e-05, | |
| "loss": 1.7851, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 6.78291654586792, | |
| "learning_rate": 1.9940100502512564e-05, | |
| "loss": 1.758, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.534212589263916, | |
| "learning_rate": 1.9920000000000002e-05, | |
| "loss": 1.7659, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 6.295835018157959, | |
| "learning_rate": 1.9899899497487437e-05, | |
| "loss": 1.7201, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 5.502695083618164, | |
| "learning_rate": 1.987979899497488e-05, | |
| "loss": 1.7634, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 6.535002708435059, | |
| "learning_rate": 1.9859698492462313e-05, | |
| "loss": 1.7322, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 5.8399834632873535, | |
| "learning_rate": 1.983959798994975e-05, | |
| "loss": 1.7306, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.5027928352355957, | |
| "learning_rate": 1.9819497487437185e-05, | |
| "loss": 1.6763, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.470935821533203, | |
| "learning_rate": 1.9799396984924623e-05, | |
| "loss": 1.687, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 6.546024799346924, | |
| "learning_rate": 1.977929648241206e-05, | |
| "loss": 1.6944, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 6.186180591583252, | |
| "learning_rate": 1.97591959798995e-05, | |
| "loss": 1.7085, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.819445848464966, | |
| "learning_rate": 1.9739095477386937e-05, | |
| "loss": 1.6765, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.2868876457214355, | |
| "learning_rate": 1.9718994974874372e-05, | |
| "loss": 1.6888, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 6.690129280090332, | |
| "learning_rate": 1.969889447236181e-05, | |
| "loss": 1.675, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 7.435989856719971, | |
| "learning_rate": 1.9678793969849248e-05, | |
| "loss": 1.6913, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.209521293640137, | |
| "learning_rate": 1.9658693467336686e-05, | |
| "loss": 1.7021, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 7.704258441925049, | |
| "learning_rate": 1.963859296482412e-05, | |
| "loss": 1.6973, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 6.3551025390625, | |
| "learning_rate": 1.9618492462311562e-05, | |
| "loss": 1.6951, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.278153419494629, | |
| "learning_rate": 1.9598391959798996e-05, | |
| "loss": 1.6749, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.053964614868164, | |
| "learning_rate": 1.9578291457286434e-05, | |
| "loss": 1.6839, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.125670909881592, | |
| "learning_rate": 1.955819095477387e-05, | |
| "loss": 1.6734, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.6489973068237305, | |
| "learning_rate": 1.953809045226131e-05, | |
| "loss": 1.6436, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 6.31420373916626, | |
| "learning_rate": 1.9517989949748745e-05, | |
| "loss": 1.6781, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.799983501434326, | |
| "learning_rate": 1.9497889447236183e-05, | |
| "loss": 1.6766, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.4388930797576904, | |
| "learning_rate": 1.9477788944723618e-05, | |
| "loss": 1.6613, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.100885391235352, | |
| "learning_rate": 1.945768844221106e-05, | |
| "loss": 1.6885, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.916701793670654, | |
| "learning_rate": 1.9437587939698493e-05, | |
| "loss": 1.6705, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.234365463256836, | |
| "learning_rate": 1.941748743718593e-05, | |
| "loss": 1.6803, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.592808723449707, | |
| "learning_rate": 1.939738693467337e-05, | |
| "loss": 1.6222, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.33900260925293, | |
| "learning_rate": 1.9377286432160804e-05, | |
| "loss": 1.6275, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.385664939880371, | |
| "learning_rate": 1.9357185929648242e-05, | |
| "loss": 1.6234, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.868696689605713, | |
| "learning_rate": 1.933708542713568e-05, | |
| "loss": 1.6584, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.236993789672852, | |
| "learning_rate": 1.9316984924623118e-05, | |
| "loss": 1.6342, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.226047515869141, | |
| "learning_rate": 1.9296884422110552e-05, | |
| "loss": 1.6109, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.2877655029296875, | |
| "learning_rate": 1.9276783919597994e-05, | |
| "loss": 1.632, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.26241397857666, | |
| "learning_rate": 1.925668341708543e-05, | |
| "loss": 1.5813, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.126381874084473, | |
| "learning_rate": 1.9236582914572866e-05, | |
| "loss": 1.6013, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.520874977111816, | |
| "learning_rate": 1.92164824120603e-05, | |
| "loss": 1.6499, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.288824558258057, | |
| "learning_rate": 1.9196381909547742e-05, | |
| "loss": 1.6151, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.139670372009277, | |
| "learning_rate": 1.9176281407035177e-05, | |
| "loss": 1.61, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.32205867767334, | |
| "learning_rate": 1.9156180904522615e-05, | |
| "loss": 1.6125, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 3.354684829711914, | |
| "learning_rate": 1.913608040201005e-05, | |
| "loss": 1.6219, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.567023754119873, | |
| "learning_rate": 1.911597989949749e-05, | |
| "loss": 1.6326, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.753734588623047, | |
| "learning_rate": 1.9095879396984925e-05, | |
| "loss": 1.5992, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.663640975952148, | |
| "learning_rate": 1.9075778894472363e-05, | |
| "loss": 1.5792, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.779416561126709, | |
| "learning_rate": 1.90556783919598e-05, | |
| "loss": 1.6072, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.422142505645752, | |
| "learning_rate": 1.903557788944724e-05, | |
| "loss": 1.6244, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.999263286590576, | |
| "learning_rate": 1.9015477386934674e-05, | |
| "loss": 1.6056, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.253634452819824, | |
| "learning_rate": 1.8995376884422112e-05, | |
| "loss": 1.5862, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.289574146270752, | |
| "learning_rate": 1.897527638190955e-05, | |
| "loss": 1.6188, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.021580219268799, | |
| "learning_rate": 1.8955175879396988e-05, | |
| "loss": 1.5506, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.235142707824707, | |
| "learning_rate": 1.893527638190955e-05, | |
| "loss": 1.5975, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.086207866668701, | |
| "learning_rate": 1.8915175879396988e-05, | |
| "loss": 1.5796, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.470453262329102, | |
| "learning_rate": 1.8895075376884423e-05, | |
| "loss": 1.5871, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.826024532318115, | |
| "learning_rate": 1.887497487437186e-05, | |
| "loss": 1.5398, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.6210761070251465, | |
| "learning_rate": 1.88548743718593e-05, | |
| "loss": 1.6186, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.72627592086792, | |
| "learning_rate": 1.8834773869346733e-05, | |
| "loss": 1.5377, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.252579212188721, | |
| "learning_rate": 1.881467336683417e-05, | |
| "loss": 1.6209, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.615140914916992, | |
| "learning_rate": 1.879457286432161e-05, | |
| "loss": 1.5482, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 3.57523512840271, | |
| "learning_rate": 1.8774472361809047e-05, | |
| "loss": 1.5903, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.865254878997803, | |
| "learning_rate": 1.8754371859296482e-05, | |
| "loss": 1.596, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.929252624511719, | |
| "learning_rate": 1.8734271356783923e-05, | |
| "loss": 1.538, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 3.798943281173706, | |
| "learning_rate": 1.8714170854271358e-05, | |
| "loss": 1.5793, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.028155326843262, | |
| "learning_rate": 1.8694070351758796e-05, | |
| "loss": 1.5486, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.279233932495117, | |
| "learning_rate": 1.867396984924623e-05, | |
| "loss": 1.5538, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 3.5549476146698, | |
| "learning_rate": 1.8653869346733672e-05, | |
| "loss": 1.571, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.669614315032959, | |
| "learning_rate": 1.8633768844221106e-05, | |
| "loss": 1.5943, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.953083515167236, | |
| "learning_rate": 1.8613668341708544e-05, | |
| "loss": 1.5622, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.570889472961426, | |
| "learning_rate": 1.8593567839195982e-05, | |
| "loss": 1.5548, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.883716583251953, | |
| "learning_rate": 1.857346733668342e-05, | |
| "loss": 1.5523, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.618870258331299, | |
| "learning_rate": 1.8553366834170855e-05, | |
| "loss": 1.573, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.305469989776611, | |
| "learning_rate": 1.8533266331658293e-05, | |
| "loss": 1.5568, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.411215782165527, | |
| "learning_rate": 1.851316582914573e-05, | |
| "loss": 1.5381, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 4.859926700592041, | |
| "learning_rate": 1.849306532663317e-05, | |
| "loss": 1.5554, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 3.7912240028381348, | |
| "learning_rate": 1.8473165829145728e-05, | |
| "loss": 1.4991, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.7156901359558105, | |
| "learning_rate": 1.845306532663317e-05, | |
| "loss": 1.567, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.494976043701172, | |
| "learning_rate": 1.8432964824120604e-05, | |
| "loss": 1.5879, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 4.401707172393799, | |
| "learning_rate": 1.8412864321608042e-05, | |
| "loss": 1.5464, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.136995315551758, | |
| "learning_rate": 1.839276381909548e-05, | |
| "loss": 1.5808, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.482088565826416, | |
| "learning_rate": 1.8372663316582918e-05, | |
| "loss": 1.5014, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.293674945831299, | |
| "learning_rate": 1.8352562814070352e-05, | |
| "loss": 1.5409, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 6.073010444641113, | |
| "learning_rate": 1.833246231155779e-05, | |
| "loss": 1.5292, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.509156703948975, | |
| "learning_rate": 1.8312361809045228e-05, | |
| "loss": 1.5243, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 7.400144577026367, | |
| "learning_rate": 1.8292261306532663e-05, | |
| "loss": 1.5577, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.070242881774902, | |
| "learning_rate": 1.8272160804020104e-05, | |
| "loss": 1.5394, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.7604804039001465, | |
| "learning_rate": 1.825206030150754e-05, | |
| "loss": 1.5335, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.665848731994629, | |
| "learning_rate": 1.8231959798994977e-05, | |
| "loss": 1.5484, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 3.539947271347046, | |
| "learning_rate": 1.821185929648241e-05, | |
| "loss": 1.5424, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 6.976469039916992, | |
| "learning_rate": 1.8191758793969853e-05, | |
| "loss": 1.5128, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.112757682800293, | |
| "learning_rate": 1.8171658291457287e-05, | |
| "loss": 1.5085, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.922628402709961, | |
| "learning_rate": 1.8151557788944725e-05, | |
| "loss": 1.5107, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.697315692901611, | |
| "learning_rate": 1.813145728643216e-05, | |
| "loss": 1.5258, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 7.6928253173828125, | |
| "learning_rate": 1.81113567839196e-05, | |
| "loss": 1.525, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 3.736105442047119, | |
| "learning_rate": 1.8091256281407036e-05, | |
| "loss": 1.5156, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 1.5033862590789795, | |
| "eval_runtime": 21.8773, | |
| "eval_samples_per_second": 45.709, | |
| "eval_steps_per_second": 5.714, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 6.009729385375977, | |
| "learning_rate": 1.8071155778894474e-05, | |
| "loss": 1.564, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.207152366638184, | |
| "learning_rate": 1.8051256281407036e-05, | |
| "loss": 1.5506, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 4.1969685554504395, | |
| "learning_rate": 1.8031155778894474e-05, | |
| "loss": 1.508, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 4.609541416168213, | |
| "learning_rate": 1.801105527638191e-05, | |
| "loss": 1.5556, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 4.768685340881348, | |
| "learning_rate": 1.799095477386935e-05, | |
| "loss": 1.5357, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 6.3910651206970215, | |
| "learning_rate": 1.7970854271356785e-05, | |
| "loss": 1.5377, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 4.312323570251465, | |
| "learning_rate": 1.7950753768844223e-05, | |
| "loss": 1.5454, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 3.9479427337646484, | |
| "learning_rate": 1.793065326633166e-05, | |
| "loss": 1.4863, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 5.744295120239258, | |
| "learning_rate": 1.79105527638191e-05, | |
| "loss": 1.5507, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 4.2211833000183105, | |
| "learning_rate": 1.7890452261306533e-05, | |
| "loss": 1.5182, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 4.841630458831787, | |
| "learning_rate": 1.787035175879397e-05, | |
| "loss": 1.4814, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.714913845062256, | |
| "learning_rate": 1.785025125628141e-05, | |
| "loss": 1.4904, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 6.587597846984863, | |
| "learning_rate": 1.7830150753768847e-05, | |
| "loss": 1.5045, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 4.343375205993652, | |
| "learning_rate": 1.7810050251256285e-05, | |
| "loss": 1.5143, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 7.786270618438721, | |
| "learning_rate": 1.778994974874372e-05, | |
| "loss": 1.496, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 7.0261054039001465, | |
| "learning_rate": 1.7769849246231158e-05, | |
| "loss": 1.5113, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 5.448154449462891, | |
| "learning_rate": 1.7749748743718592e-05, | |
| "loss": 1.4699, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 5.4564361572265625, | |
| "learning_rate": 1.7729648241206034e-05, | |
| "loss": 1.4909, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 5.704242706298828, | |
| "learning_rate": 1.7709547738693468e-05, | |
| "loss": 1.4931, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 4.819602966308594, | |
| "learning_rate": 1.7689447236180906e-05, | |
| "loss": 1.4981, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.80843186378479, | |
| "learning_rate": 1.766934673366834e-05, | |
| "loss": 1.4753, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 4.010366439819336, | |
| "learning_rate": 1.7649246231155782e-05, | |
| "loss": 1.4899, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 6.8596391677856445, | |
| "learning_rate": 1.7629145728643217e-05, | |
| "loss": 1.4951, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 5.7791643142700195, | |
| "learning_rate": 1.7609246231155782e-05, | |
| "loss": 1.4841, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 4.629549503326416, | |
| "learning_rate": 1.7589145728643217e-05, | |
| "loss": 1.4918, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 4.676841735839844, | |
| "learning_rate": 1.7569045226130655e-05, | |
| "loss": 1.4934, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 5.469869613647461, | |
| "learning_rate": 1.754894472361809e-05, | |
| "loss": 1.4767, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.605990886688232, | |
| "learning_rate": 1.752884422110553e-05, | |
| "loss": 1.4635, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 5.055588722229004, | |
| "learning_rate": 1.7508743718592966e-05, | |
| "loss": 1.4925, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 4.2058916091918945, | |
| "learning_rate": 1.7488643216080404e-05, | |
| "loss": 1.4988, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 13.90904426574707, | |
| "learning_rate": 1.7468542713567838e-05, | |
| "loss": 1.4883, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 5.441417694091797, | |
| "learning_rate": 1.744844221105528e-05, | |
| "loss": 1.4817, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 4.476733684539795, | |
| "learning_rate": 1.7428341708542714e-05, | |
| "loss": 1.5099, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 4.831094264984131, | |
| "learning_rate": 1.7408241206030152e-05, | |
| "loss": 1.4817, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 6.846999645233154, | |
| "learning_rate": 1.738814070351759e-05, | |
| "loss": 1.473, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 4.799276351928711, | |
| "learning_rate": 1.7368040201005028e-05, | |
| "loss": 1.4969, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 7.644506931304932, | |
| "learning_rate": 1.7347939698492463e-05, | |
| "loss": 1.4836, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 4.794766426086426, | |
| "learning_rate": 1.73278391959799e-05, | |
| "loss": 1.4993, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 5.148614406585693, | |
| "learning_rate": 1.730773869346734e-05, | |
| "loss": 1.4697, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 3.6471970081329346, | |
| "learning_rate": 1.7287638190954777e-05, | |
| "loss": 1.4811, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 5.793773174285889, | |
| "learning_rate": 1.7267537688442214e-05, | |
| "loss": 1.4988, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 4.239154815673828, | |
| "learning_rate": 1.724743718592965e-05, | |
| "loss": 1.4673, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 5.415383815765381, | |
| "learning_rate": 1.7227336683417087e-05, | |
| "loss": 1.4966, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 4.704416275024414, | |
| "learning_rate": 1.720723618090452e-05, | |
| "loss": 1.4996, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.5393893718719482, | |
| "learning_rate": 1.7187336683417087e-05, | |
| "loss": 1.4677, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 3.5352883338928223, | |
| "learning_rate": 1.7167236180904522e-05, | |
| "loss": 1.4739, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 5.679812431335449, | |
| "learning_rate": 1.7147135678391963e-05, | |
| "loss": 1.4556, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": null, | |
| "learning_rate": 1.7127236180904526e-05, | |
| "loss": 1.4665, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 5.901428699493408, | |
| "learning_rate": 1.710713567839196e-05, | |
| "loss": 1.464, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 4.5120744705200195, | |
| "learning_rate": 1.70870351758794e-05, | |
| "loss": 1.4752, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 4.715979099273682, | |
| "learning_rate": 1.7066934673366836e-05, | |
| "loss": 1.4632, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 5.309842109680176, | |
| "learning_rate": 1.704683417085427e-05, | |
| "loss": 1.4474, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 4.106339454650879, | |
| "learning_rate": 1.7026733668341712e-05, | |
| "loss": 1.4706, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 9.052672386169434, | |
| "learning_rate": 1.7006633165829147e-05, | |
| "loss": 1.4662, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 5.6153059005737305, | |
| "learning_rate": 1.6986532663316585e-05, | |
| "loss": 1.485, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 4.049362659454346, | |
| "learning_rate": 1.696643216080402e-05, | |
| "loss": 1.4847, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.916749477386475, | |
| "learning_rate": 1.694633165829146e-05, | |
| "loss": 1.4487, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.682246685028076, | |
| "learning_rate": 1.6926231155778895e-05, | |
| "loss": 1.4227, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.3342747688293457, | |
| "learning_rate": 1.6906130653266333e-05, | |
| "loss": 1.4564, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 5.837874412536621, | |
| "learning_rate": 1.6886030150753768e-05, | |
| "loss": 1.4759, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 4.549025058746338, | |
| "learning_rate": 1.686592964824121e-05, | |
| "loss": 1.4381, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 6.344630718231201, | |
| "learning_rate": 1.6845829145728644e-05, | |
| "loss": 1.4824, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 4.8240485191345215, | |
| "learning_rate": 1.6825728643216082e-05, | |
| "loss": 1.4631, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 4.205628395080566, | |
| "learning_rate": 1.680562814070352e-05, | |
| "loss": 1.4683, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 4.132819652557373, | |
| "learning_rate": 1.6785527638190958e-05, | |
| "loss": 1.4209, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 4.3151140213012695, | |
| "learning_rate": 1.6765427135678392e-05, | |
| "loss": 1.4468, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 5.677152633666992, | |
| "learning_rate": 1.674532663316583e-05, | |
| "loss": 1.4198, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 4.871775150299072, | |
| "learning_rate": 1.672522613065327e-05, | |
| "loss": 1.4904, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.693517208099365, | |
| "learning_rate": 1.6705125628140706e-05, | |
| "loss": 1.4592, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 4.093634605407715, | |
| "learning_rate": 1.668502512562814e-05, | |
| "loss": 1.4561, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 6.508328437805176, | |
| "learning_rate": 1.666492462311558e-05, | |
| "loss": 1.456, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 8.67950439453125, | |
| "learning_rate": 1.6644824120603017e-05, | |
| "loss": 1.4432, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 6.435894012451172, | |
| "learning_rate": 1.662472361809045e-05, | |
| "loss": 1.4807, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 4.387815952301025, | |
| "learning_rate": 1.6604623115577893e-05, | |
| "loss": 1.4171, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 5.042853832244873, | |
| "learning_rate": 1.6584522613065327e-05, | |
| "loss": 1.4361, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 4.579937934875488, | |
| "learning_rate": 1.6564422110552765e-05, | |
| "loss": 1.4752, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 8.229300498962402, | |
| "learning_rate": 1.65443216080402e-05, | |
| "loss": 1.4058, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 5.82681131362915, | |
| "learning_rate": 1.652422110552764e-05, | |
| "loss": 1.4353, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 3.8094637393951416, | |
| "learning_rate": 1.6504120603015076e-05, | |
| "loss": 1.4043, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 5.825170993804932, | |
| "learning_rate": 1.6484020100502514e-05, | |
| "loss": 1.458, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 5.906398773193359, | |
| "learning_rate": 1.6463919597989952e-05, | |
| "loss": 1.4373, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 4.367284774780273, | |
| "learning_rate": 1.644381909547739e-05, | |
| "loss": 1.4119, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 4.767496585845947, | |
| "learning_rate": 1.6423718592964824e-05, | |
| "loss": 1.4378, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.3912670612335205, | |
| "learning_rate": 1.6403618090452262e-05, | |
| "loss": 1.4452, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 4.836172580718994, | |
| "learning_rate": 1.63835175879397e-05, | |
| "loss": 1.4054, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 7.216467380523682, | |
| "learning_rate": 1.636341708542714e-05, | |
| "loss": 1.4604, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 4.356799602508545, | |
| "learning_rate": 1.6343316582914573e-05, | |
| "loss": 1.4552, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 3.337068557739258, | |
| "learning_rate": 1.632321608040201e-05, | |
| "loss": 1.4597, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 4.059195518493652, | |
| "learning_rate": 1.630311557788945e-05, | |
| "loss": 1.467, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 3.697249412536621, | |
| "learning_rate": 1.6283015075376887e-05, | |
| "loss": 1.445, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 6.429022789001465, | |
| "learning_rate": 1.6262914572864325e-05, | |
| "loss": 1.4536, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 5.085973739624023, | |
| "learning_rate": 1.624281407035176e-05, | |
| "loss": 1.4507, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 4.309168815612793, | |
| "learning_rate": 1.6222713567839197e-05, | |
| "loss": 1.4319, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 5.077241897583008, | |
| "learning_rate": 1.6202613065326635e-05, | |
| "loss": 1.4103, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 3.984090566635132, | |
| "learning_rate": 1.6182512562814073e-05, | |
| "loss": 1.4104, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 4.95877742767334, | |
| "learning_rate": 1.6162412060301508e-05, | |
| "loss": 1.4656, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 4.357282638549805, | |
| "learning_rate": 1.6142311557788946e-05, | |
| "loss": 1.4453, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 5.499750137329102, | |
| "learning_rate": 1.6122211055276384e-05, | |
| "loss": 1.409, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 4.081977367401123, | |
| "learning_rate": 1.6102110552763822e-05, | |
| "loss": 1.41, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 5.961399078369141, | |
| "learning_rate": 1.6082010050251256e-05, | |
| "loss": 1.393, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.431087613105774, | |
| "eval_runtime": 21.8687, | |
| "eval_samples_per_second": 45.727, | |
| "eval_steps_per_second": 5.716, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.539051532745361, | |
| "learning_rate": 1.6062110552763822e-05, | |
| "loss": 1.4339, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 7.362614631652832, | |
| "learning_rate": 1.6042010050251257e-05, | |
| "loss": 1.4047, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 4.155520439147949, | |
| "learning_rate": 1.6021909547738695e-05, | |
| "loss": 1.3943, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 4.347718715667725, | |
| "learning_rate": 1.600180904522613e-05, | |
| "loss": 1.432, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 4.478184700012207, | |
| "learning_rate": 1.598170854271357e-05, | |
| "loss": 1.4201, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 8.239706993103027, | |
| "learning_rate": 1.5961608040201005e-05, | |
| "loss": 1.3944, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 6.581277370452881, | |
| "learning_rate": 1.5941507537688443e-05, | |
| "loss": 1.359, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 4.083044528961182, | |
| "learning_rate": 1.592140703517588e-05, | |
| "loss": 1.4004, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 5.052839756011963, | |
| "learning_rate": 1.590130653266332e-05, | |
| "loss": 1.4157, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 3.8107857704162598, | |
| "learning_rate": 1.5881206030150754e-05, | |
| "loss": 1.3823, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 6.1900954246521, | |
| "learning_rate": 1.5861105527638192e-05, | |
| "loss": 1.4194, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 4.510327339172363, | |
| "learning_rate": 1.584100502512563e-05, | |
| "loss": 1.4173, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 6.412552833557129, | |
| "learning_rate": 1.5820904522613068e-05, | |
| "loss": 1.3996, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 5.167262077331543, | |
| "learning_rate": 1.5800804020100506e-05, | |
| "loss": 1.4007, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 4.012689590454102, | |
| "learning_rate": 1.578070351758794e-05, | |
| "loss": 1.3875, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 6.7843017578125, | |
| "learning_rate": 1.5760603015075378e-05, | |
| "loss": 1.4192, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 5.4605207443237305, | |
| "learning_rate": 1.5740502512562816e-05, | |
| "loss": 1.421, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 8.303611755371094, | |
| "learning_rate": 1.5720402010050254e-05, | |
| "loss": 1.45, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 4.898472309112549, | |
| "learning_rate": 1.570030150753769e-05, | |
| "loss": 1.3982, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 6.471447467803955, | |
| "learning_rate": 1.5680201005025127e-05, | |
| "loss": 1.4272, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 7.5459885597229, | |
| "learning_rate": 1.5660100502512565e-05, | |
| "loss": 1.3861, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 7.108932971954346, | |
| "learning_rate": 1.5640000000000003e-05, | |
| "loss": 1.3946, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 5.083498954772949, | |
| "learning_rate": 1.5619899497487437e-05, | |
| "loss": 1.4006, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 6.796627998352051, | |
| "learning_rate": 1.5599798994974875e-05, | |
| "loss": 1.4266, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 5.1619367599487305, | |
| "learning_rate": 1.5579698492462313e-05, | |
| "loss": 1.3567, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 5.548572063446045, | |
| "learning_rate": 1.555959798994975e-05, | |
| "loss": 1.4193, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 4.019492149353027, | |
| "learning_rate": 1.5539497487437186e-05, | |
| "loss": 1.3988, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 5.359696865081787, | |
| "learning_rate": 1.5519396984924624e-05, | |
| "loss": 1.4046, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 3.4442272186279297, | |
| "learning_rate": 1.5499296482412062e-05, | |
| "loss": 1.3924, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 5.838873863220215, | |
| "learning_rate": 1.5479396984924624e-05, | |
| "loss": 1.4141, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 5.01621150970459, | |
| "learning_rate": 1.5459296482412062e-05, | |
| "loss": 1.3658, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 5.7665205001831055, | |
| "learning_rate": 1.54391959798995e-05, | |
| "loss": 1.4245, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 3.1527726650238037, | |
| "learning_rate": 1.5419095477386935e-05, | |
| "loss": 1.3695, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 6.3304924964904785, | |
| "learning_rate": 1.5398994974874373e-05, | |
| "loss": 1.3978, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 7.042291164398193, | |
| "learning_rate": 1.537889447236181e-05, | |
| "loss": 1.385, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 4.397637844085693, | |
| "learning_rate": 1.535879396984925e-05, | |
| "loss": 1.3726, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 3.7688262462615967, | |
| "learning_rate": 1.5338693467336687e-05, | |
| "loss": 1.3896, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 4.961839199066162, | |
| "learning_rate": 1.531859296482412e-05, | |
| "loss": 1.3792, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 4.08626127243042, | |
| "learning_rate": 1.529849246231156e-05, | |
| "loss": 1.3507, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 4.055938243865967, | |
| "learning_rate": 1.5278391959798997e-05, | |
| "loss": 1.3873, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 5.093524932861328, | |
| "learning_rate": 1.5258291457286433e-05, | |
| "loss": 1.3763, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 5.755058288574219, | |
| "learning_rate": 1.523819095477387e-05, | |
| "loss": 1.4164, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 4.845275402069092, | |
| "learning_rate": 1.5218090452261308e-05, | |
| "loss": 1.3103, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 5.180044174194336, | |
| "learning_rate": 1.5197989949748746e-05, | |
| "loss": 1.3739, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 5.913352012634277, | |
| "learning_rate": 1.5178090452261306e-05, | |
| "loss": 1.3946, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 6.644520282745361, | |
| "learning_rate": 1.5157989949748746e-05, | |
| "loss": 1.3683, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 4.617815017700195, | |
| "learning_rate": 1.5137889447236182e-05, | |
| "loss": 1.3588, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 5.690709114074707, | |
| "learning_rate": 1.5117788944723619e-05, | |
| "loss": 1.4102, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 4.181049823760986, | |
| "learning_rate": 1.5097688442211057e-05, | |
| "loss": 1.3777, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 5.829825401306152, | |
| "learning_rate": 1.5077587939698495e-05, | |
| "loss": 1.4278, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 4.409423351287842, | |
| "learning_rate": 1.505748743718593e-05, | |
| "loss": 1.3751, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 5.290346145629883, | |
| "learning_rate": 1.5037386934673369e-05, | |
| "loss": 1.3752, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 6.922583103179932, | |
| "learning_rate": 1.5017286432160805e-05, | |
| "loss": 1.3756, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 4.969797134399414, | |
| "learning_rate": 1.4997185929648241e-05, | |
| "loss": 1.3808, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 4.4493184089660645, | |
| "learning_rate": 1.4977085427135681e-05, | |
| "loss": 1.3836, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 3.4044313430786133, | |
| "learning_rate": 1.4956984924623117e-05, | |
| "loss": 1.3737, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 3.3968327045440674, | |
| "learning_rate": 1.4936884422110554e-05, | |
| "loss": 1.3776, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 3.016774892807007, | |
| "learning_rate": 1.491678391959799e-05, | |
| "loss": 1.3642, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 6.324804306030273, | |
| "learning_rate": 1.489668341708543e-05, | |
| "loss": 1.3838, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 3.8945064544677734, | |
| "learning_rate": 1.4876582914572866e-05, | |
| "loss": 1.339, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 6.601470470428467, | |
| "learning_rate": 1.4856482412060302e-05, | |
| "loss": 1.3609, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 5.291379928588867, | |
| "learning_rate": 1.4836381909547738e-05, | |
| "loss": 1.3829, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 5.4891462326049805, | |
| "learning_rate": 1.4816281407035178e-05, | |
| "loss": 1.3733, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 4.400446891784668, | |
| "learning_rate": 1.4796180904522614e-05, | |
| "loss": 1.3601, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 5.4064860343933105, | |
| "learning_rate": 1.477608040201005e-05, | |
| "loss": 1.3922, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 6.4848737716674805, | |
| "learning_rate": 1.4755979899497489e-05, | |
| "loss": 1.3839, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 3.8651046752929688, | |
| "learning_rate": 1.4735879396984927e-05, | |
| "loss": 1.3798, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 6.21872615814209, | |
| "learning_rate": 1.4715778894472363e-05, | |
| "loss": 1.3771, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 4.698353290557861, | |
| "learning_rate": 1.46956783919598e-05, | |
| "loss": 1.3794, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 6.222665309906006, | |
| "learning_rate": 1.4675577889447237e-05, | |
| "loss": 1.3603, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 5.659895896911621, | |
| "learning_rate": 1.4655477386934675e-05, | |
| "loss": 1.4091, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 5.340900897979736, | |
| "learning_rate": 1.4635376884422113e-05, | |
| "loss": 1.385, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.827996253967285, | |
| "learning_rate": 1.461527638190955e-05, | |
| "loss": 1.3724, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 5.637544631958008, | |
| "learning_rate": 1.4595175879396986e-05, | |
| "loss": 1.3911, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 3.459794282913208, | |
| "learning_rate": 1.4575075376884422e-05, | |
| "loss": 1.3946, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 6.612933158874512, | |
| "learning_rate": 1.4554974874371862e-05, | |
| "loss": 1.404, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 4.6960577964782715, | |
| "learning_rate": 1.4534874371859298e-05, | |
| "loss": 1.3571, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 3.677015781402588, | |
| "learning_rate": 1.4514773869346734e-05, | |
| "loss": 1.4166, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 4.411760330200195, | |
| "learning_rate": 1.449467336683417e-05, | |
| "loss": 1.3666, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 4.236432075500488, | |
| "learning_rate": 1.447457286432161e-05, | |
| "loss": 1.3738, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 7.484130859375, | |
| "learning_rate": 1.4454472361809046e-05, | |
| "loss": 1.3454, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 4.38557243347168, | |
| "learning_rate": 1.4434371859296483e-05, | |
| "loss": 1.3842, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 5.947939395904541, | |
| "learning_rate": 1.441427135678392e-05, | |
| "loss": 1.3391, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 4.879386901855469, | |
| "learning_rate": 1.4394170854271359e-05, | |
| "loss": 1.3337, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 5.369794845581055, | |
| "learning_rate": 1.4374070351758795e-05, | |
| "loss": 1.371, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 6.312124252319336, | |
| "learning_rate": 1.4353969849246233e-05, | |
| "loss": 1.3321, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 5.254230976104736, | |
| "learning_rate": 1.4333869346733669e-05, | |
| "loss": 1.3835, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 5.279263496398926, | |
| "learning_rate": 1.4313768844221107e-05, | |
| "loss": 1.341, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 3.9145216941833496, | |
| "learning_rate": 1.4293668341708545e-05, | |
| "loss": 1.3457, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 18.773277282714844, | |
| "learning_rate": 1.4273768844221107e-05, | |
| "loss": 1.3617, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 4.489799499511719, | |
| "learning_rate": 1.4253668341708544e-05, | |
| "loss": 1.3604, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 3.820908784866333, | |
| "learning_rate": 1.423356783919598e-05, | |
| "loss": 1.3688, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 5.470434188842773, | |
| "learning_rate": 1.4213467336683418e-05, | |
| "loss": 1.4041, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 2.9653820991516113, | |
| "learning_rate": 1.4193366834170856e-05, | |
| "loss": 1.3593, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 4.433176517486572, | |
| "learning_rate": 1.4173266331658292e-05, | |
| "loss": 1.3402, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 3.5363194942474365, | |
| "learning_rate": 1.415316582914573e-05, | |
| "loss": 1.376, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 6.819579601287842, | |
| "learning_rate": 1.4133065326633166e-05, | |
| "loss": 1.3203, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 5.506997585296631, | |
| "learning_rate": 1.4112964824120604e-05, | |
| "loss": 1.3234, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 6.012782573699951, | |
| "learning_rate": 1.4092864321608042e-05, | |
| "loss": 1.3625, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.309823751449585, | |
| "learning_rate": 1.4072763819095479e-05, | |
| "loss": 1.351, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 1.3539750576019287, | |
| "eval_runtime": 21.8688, | |
| "eval_samples_per_second": 45.727, | |
| "eval_steps_per_second": 5.716, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 5.257068157196045, | |
| "learning_rate": 1.4052663316582915e-05, | |
| "loss": 1.348, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 13.082318305969238, | |
| "learning_rate": 1.4032562814070351e-05, | |
| "loss": 1.3791, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 4.944590091705322, | |
| "learning_rate": 1.4012462311557791e-05, | |
| "loss": 1.3805, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 4.780072212219238, | |
| "learning_rate": 1.3992361809045227e-05, | |
| "loss": 1.3651, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 4.359679698944092, | |
| "learning_rate": 1.3972261306532664e-05, | |
| "loss": 1.3118, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 4.789872646331787, | |
| "learning_rate": 1.3952160804020101e-05, | |
| "loss": 1.3662, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 4.301767349243164, | |
| "learning_rate": 1.393206030150754e-05, | |
| "loss": 1.3564, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 4.046327590942383, | |
| "learning_rate": 1.3911959798994976e-05, | |
| "loss": 1.3386, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 6.321465969085693, | |
| "learning_rate": 1.3891859296482412e-05, | |
| "loss": 1.3544, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 5.538000106811523, | |
| "learning_rate": 1.387175879396985e-05, | |
| "loss": 1.3466, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 6.25814151763916, | |
| "learning_rate": 1.3851658291457288e-05, | |
| "loss": 1.3246, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 5.343544006347656, | |
| "learning_rate": 1.3831758793969849e-05, | |
| "loss": 1.3629, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 4.390071868896484, | |
| "learning_rate": 1.3811658291457288e-05, | |
| "loss": 1.3211, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 5.539604187011719, | |
| "learning_rate": 1.3791557788944725e-05, | |
| "loss": 1.3297, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 4.06265115737915, | |
| "learning_rate": 1.3771457286432161e-05, | |
| "loss": 1.37, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 4.819797515869141, | |
| "learning_rate": 1.3751356783919599e-05, | |
| "loss": 1.3279, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 4.675261497497559, | |
| "learning_rate": 1.3731256281407037e-05, | |
| "loss": 1.321, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 6.112530708312988, | |
| "learning_rate": 1.3711155778894473e-05, | |
| "loss": 1.3321, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 4.85811185836792, | |
| "learning_rate": 1.3691055276381911e-05, | |
| "loss": 1.36, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 3.9626624584198, | |
| "learning_rate": 1.3670954773869347e-05, | |
| "loss": 1.3446, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 4.470461845397949, | |
| "learning_rate": 1.3650854271356785e-05, | |
| "loss": 1.3503, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 3.1345880031585693, | |
| "learning_rate": 1.3630753768844223e-05, | |
| "loss": 1.322, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 4.220657825469971, | |
| "learning_rate": 1.361065326633166e-05, | |
| "loss": 1.2922, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 4.827053546905518, | |
| "learning_rate": 1.3590552763819096e-05, | |
| "loss": 1.3663, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 3.613919496536255, | |
| "learning_rate": 1.3570452261306536e-05, | |
| "loss": 1.359, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 6.170840263366699, | |
| "learning_rate": 1.3550351758793972e-05, | |
| "loss": 1.3076, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 5.604345321655273, | |
| "learning_rate": 1.3530251256281408e-05, | |
| "loss": 1.3335, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 3.617830514907837, | |
| "learning_rate": 1.3510150753768844e-05, | |
| "loss": 1.3601, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 5.692416191101074, | |
| "learning_rate": 1.349005025125628e-05, | |
| "loss": 1.3117, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 6.292971611022949, | |
| "learning_rate": 1.346994974874372e-05, | |
| "loss": 1.3427, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 4.1335906982421875, | |
| "learning_rate": 1.3449849246231157e-05, | |
| "loss": 1.3593, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 3.7386412620544434, | |
| "learning_rate": 1.3429748743718593e-05, | |
| "loss": 1.3509, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 4.687217712402344, | |
| "learning_rate": 1.3409648241206031e-05, | |
| "loss": 1.3331, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 3.9221270084381104, | |
| "learning_rate": 1.3389547738693469e-05, | |
| "loss": 1.3299, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 4.383951663970947, | |
| "learning_rate": 1.3369447236180905e-05, | |
| "loss": 1.3455, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 4.3801140785217285, | |
| "learning_rate": 1.3349346733668343e-05, | |
| "loss": 1.3092, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 11.051312446594238, | |
| "learning_rate": 1.332924623115578e-05, | |
| "loss": 1.3164, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 5.16642951965332, | |
| "learning_rate": 1.3309346733668342e-05, | |
| "loss": 1.3463, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 4.878006935119629, | |
| "learning_rate": 1.328924623115578e-05, | |
| "loss": 1.3452, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 3.8694944381713867, | |
| "learning_rate": 1.3269145728643218e-05, | |
| "loss": 1.3393, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 3.1603684425354004, | |
| "learning_rate": 1.3249045226130654e-05, | |
| "loss": 1.3608, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 7.4358086585998535, | |
| "learning_rate": 1.3228944723618092e-05, | |
| "loss": 1.334, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 3.47105073928833, | |
| "learning_rate": 1.3208844221105528e-05, | |
| "loss": 1.3394, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 6.943812847137451, | |
| "learning_rate": 1.3188743718592966e-05, | |
| "loss": 1.3316, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 3.4260807037353516, | |
| "learning_rate": 1.3168643216080404e-05, | |
| "loss": 1.2544, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 5.455039978027344, | |
| "learning_rate": 1.314854271356784e-05, | |
| "loss": 1.3122, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 5.853464603424072, | |
| "learning_rate": 1.3128442211055277e-05, | |
| "loss": 1.3533, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 6.121273994445801, | |
| "learning_rate": 1.3108341708542715e-05, | |
| "loss": 1.3144, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 4.645484447479248, | |
| "learning_rate": 1.3088241206030153e-05, | |
| "loss": 1.3154, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 5.509624004364014, | |
| "learning_rate": 1.3068140703517589e-05, | |
| "loss": 1.3335, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 4.298194885253906, | |
| "learning_rate": 1.3048040201005025e-05, | |
| "loss": 1.3374, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 3.8274903297424316, | |
| "learning_rate": 1.3027939698492465e-05, | |
| "loss": 1.3577, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 3.6994845867156982, | |
| "learning_rate": 1.3007839195979901e-05, | |
| "loss": 1.3439, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.096419334411621, | |
| "learning_rate": 1.2987738693467338e-05, | |
| "loss": 1.3425, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.432124614715576, | |
| "learning_rate": 1.2967638190954774e-05, | |
| "loss": 1.3089, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 6.754784107208252, | |
| "learning_rate": 1.2947537688442212e-05, | |
| "loss": 1.3577, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 5.67179012298584, | |
| "learning_rate": 1.292743718592965e-05, | |
| "loss": 1.3158, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 5.4324517250061035, | |
| "learning_rate": 1.2907336683417086e-05, | |
| "loss": 1.3423, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 6.366858959197998, | |
| "learning_rate": 1.2887236180904524e-05, | |
| "loss": 1.348, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 4.625099182128906, | |
| "learning_rate": 1.286713567839196e-05, | |
| "loss": 1.344, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 4.306463718414307, | |
| "learning_rate": 1.2847035175879398e-05, | |
| "loss": 1.3471, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 5.759509086608887, | |
| "learning_rate": 1.2826934673366835e-05, | |
| "loss": 1.2994, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 6.741412162780762, | |
| "learning_rate": 1.2806834170854273e-05, | |
| "loss": 1.3385, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 8.009258270263672, | |
| "learning_rate": 1.2786733668341709e-05, | |
| "loss": 1.307, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 4.115825653076172, | |
| "learning_rate": 1.2766633165829147e-05, | |
| "loss": 1.3514, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 6.015181541442871, | |
| "learning_rate": 1.2746532663316585e-05, | |
| "loss": 1.3126, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 5.512998104095459, | |
| "learning_rate": 1.2726633165829147e-05, | |
| "loss": 1.3288, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 4.820090293884277, | |
| "learning_rate": 1.2706532663316583e-05, | |
| "loss": 1.322, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 5.174429893493652, | |
| "learning_rate": 1.2686432160804021e-05, | |
| "loss": 1.3451, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 5.226765155792236, | |
| "learning_rate": 1.2666331658291458e-05, | |
| "loss": 1.3362, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 7.082357406616211, | |
| "learning_rate": 1.2646231155778896e-05, | |
| "loss": 1.3053, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 5.663987636566162, | |
| "learning_rate": 1.2626130653266334e-05, | |
| "loss": 1.3312, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 4.99432897567749, | |
| "learning_rate": 1.260603015075377e-05, | |
| "loss": 1.2969, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 3.6237847805023193, | |
| "learning_rate": 1.2585929648241206e-05, | |
| "loss": 1.3284, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 4.317774772644043, | |
| "learning_rate": 1.2565829145728646e-05, | |
| "loss": 1.2724, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 28.135671615600586, | |
| "learning_rate": 1.2545728643216082e-05, | |
| "loss": 1.301, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 6.461686611175537, | |
| "learning_rate": 1.2525628140703518e-05, | |
| "loss": 1.311, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 5.122781276702881, | |
| "learning_rate": 1.2505527638190955e-05, | |
| "loss": 1.3124, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 7.070276737213135, | |
| "learning_rate": 1.2485628140703519e-05, | |
| "loss": 1.3042, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 5.5672607421875, | |
| "learning_rate": 1.2465527638190955e-05, | |
| "loss": 1.2705, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 5.857154369354248, | |
| "learning_rate": 1.2445427135678395e-05, | |
| "loss": 1.2842, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 7.3641133308410645, | |
| "learning_rate": 1.2425326633165831e-05, | |
| "loss": 1.3284, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 7.004215717315674, | |
| "learning_rate": 1.2405226130653267e-05, | |
| "loss": 1.3106, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 6.171952247619629, | |
| "learning_rate": 1.2385125628140704e-05, | |
| "loss": 1.2711, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 4.094524383544922, | |
| "learning_rate": 1.236502512562814e-05, | |
| "loss": 1.2879, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 6.0571980476379395, | |
| "learning_rate": 1.234492462311558e-05, | |
| "loss": 1.3454, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.057672023773193, | |
| "learning_rate": 1.2324824120603016e-05, | |
| "loss": 1.3275, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 6.506563663482666, | |
| "learning_rate": 1.2304723618090452e-05, | |
| "loss": 1.2866, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 5.117976188659668, | |
| "learning_rate": 1.228462311557789e-05, | |
| "loss": 1.2811, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 4.050692558288574, | |
| "learning_rate": 1.2264522613065328e-05, | |
| "loss": 1.2682, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 4.216948509216309, | |
| "learning_rate": 1.2244422110552764e-05, | |
| "loss": 1.3079, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 5.064427375793457, | |
| "learning_rate": 1.2224321608040202e-05, | |
| "loss": 1.3022, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.869637489318848, | |
| "learning_rate": 1.2204221105527639e-05, | |
| "loss": 1.294, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 4.062154769897461, | |
| "learning_rate": 1.2184120603015077e-05, | |
| "loss": 1.3249, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 5.1579976081848145, | |
| "learning_rate": 1.2164020100502515e-05, | |
| "loss": 1.3006, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 6.146691799163818, | |
| "learning_rate": 1.214391959798995e-05, | |
| "loss": 1.317, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 5.503583908081055, | |
| "learning_rate": 1.2123819095477387e-05, | |
| "loss": 1.2803, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 5.574082374572754, | |
| "learning_rate": 1.2103718592964827e-05, | |
| "loss": 1.3149, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 7.74934720993042, | |
| "learning_rate": 1.2083618090452263e-05, | |
| "loss": 1.3085, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.324882984161377, | |
| "learning_rate": 1.20635175879397e-05, | |
| "loss": 1.2662, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.311901330947876, | |
| "eval_runtime": 21.8453, | |
| "eval_samples_per_second": 45.776, | |
| "eval_steps_per_second": 5.722, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 4.58272647857666, | |
| "learning_rate": 1.2043417085427136e-05, | |
| "loss": 1.251, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 6.063676357269287, | |
| "learning_rate": 1.2023316582914575e-05, | |
| "loss": 1.3191, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 6.219407558441162, | |
| "learning_rate": 1.2003216080402012e-05, | |
| "loss": 1.2535, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 4.881536960601807, | |
| "learning_rate": 1.1983115577889448e-05, | |
| "loss": 1.3005, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 2.7239251136779785, | |
| "learning_rate": 1.1963015075376884e-05, | |
| "loss": 1.2162, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 4.018795490264893, | |
| "learning_rate": 1.1942914572864324e-05, | |
| "loss": 1.2869, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 5.884511947631836, | |
| "learning_rate": 1.192281407035176e-05, | |
| "loss": 1.3242, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 6.765765190124512, | |
| "learning_rate": 1.1902713567839196e-05, | |
| "loss": 1.3089, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 7.366244316101074, | |
| "learning_rate": 1.1882613065326634e-05, | |
| "loss": 1.2574, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 4.718629837036133, | |
| "learning_rate": 1.186251256281407e-05, | |
| "loss": 1.2967, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 3.110818386077881, | |
| "learning_rate": 1.1842412060301509e-05, | |
| "loss": 1.2619, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 5.5501298904418945, | |
| "learning_rate": 1.1822311557788947e-05, | |
| "loss": 1.2895, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 4.115253925323486, | |
| "learning_rate": 1.1802412060301509e-05, | |
| "loss": 1.2978, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 4.711161136627197, | |
| "learning_rate": 1.1782311557788945e-05, | |
| "loss": 1.2846, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 4.220121383666992, | |
| "learning_rate": 1.1762211055276383e-05, | |
| "loss": 1.2873, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.8940072059631348, | |
| "learning_rate": 1.174211055276382e-05, | |
| "loss": 1.2794, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 4.298203468322754, | |
| "learning_rate": 1.1722010050251257e-05, | |
| "loss": 1.2412, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.983751058578491, | |
| "learning_rate": 1.1701909547738694e-05, | |
| "loss": 1.2379, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 5.402541637420654, | |
| "learning_rate": 1.1681809045226132e-05, | |
| "loss": 1.2579, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 4.412992000579834, | |
| "learning_rate": 1.1661708542713568e-05, | |
| "loss": 1.2576, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 3.6417946815490723, | |
| "learning_rate": 1.1641608040201006e-05, | |
| "loss": 1.2711, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 4.4454216957092285, | |
| "learning_rate": 1.1621507537688444e-05, | |
| "loss": 1.2823, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.299724578857422, | |
| "learning_rate": 1.160140703517588e-05, | |
| "loss": 1.2559, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 4.227545261383057, | |
| "learning_rate": 1.1581306532663317e-05, | |
| "loss": 1.2321, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.005281925201416, | |
| "learning_rate": 1.1561206030150756e-05, | |
| "loss": 1.2988, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 5.402487754821777, | |
| "learning_rate": 1.1541105527638192e-05, | |
| "loss": 1.2935, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 4.937114715576172, | |
| "learning_rate": 1.1521005025125629e-05, | |
| "loss": 1.2786, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 5.078079700469971, | |
| "learning_rate": 1.1500904522613065e-05, | |
| "loss": 1.2735, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 3.767037868499756, | |
| "learning_rate": 1.1480804020100505e-05, | |
| "loss": 1.2557, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 5.843142032623291, | |
| "learning_rate": 1.1460703517587941e-05, | |
| "loss": 1.2338, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 7.4397053718566895, | |
| "learning_rate": 1.1440804020100505e-05, | |
| "loss": 1.2841, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 5.660872459411621, | |
| "learning_rate": 1.1420703517587941e-05, | |
| "loss": 1.245, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 5.597433567047119, | |
| "learning_rate": 1.1400603015075378e-05, | |
| "loss": 1.2267, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 5.053484916687012, | |
| "learning_rate": 1.1380502512562814e-05, | |
| "loss": 1.2734, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 4.610946178436279, | |
| "learning_rate": 1.1360402010050254e-05, | |
| "loss": 1.2443, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 4.457014083862305, | |
| "learning_rate": 1.134030150753769e-05, | |
| "loss": 1.2665, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 4.798270225524902, | |
| "learning_rate": 1.1320201005025126e-05, | |
| "loss": 1.3062, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 6.110182762145996, | |
| "learning_rate": 1.1300100502512562e-05, | |
| "loss": 1.2652, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 5.528191089630127, | |
| "learning_rate": 1.128e-05, | |
| "loss": 1.2886, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 6.199995994567871, | |
| "learning_rate": 1.1259899497487438e-05, | |
| "loss": 1.2431, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 4.476943492889404, | |
| "learning_rate": 1.1239798994974875e-05, | |
| "loss": 1.292, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 4.873668670654297, | |
| "learning_rate": 1.1219698492462313e-05, | |
| "loss": 1.3071, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 4.769585609436035, | |
| "learning_rate": 1.1199597989949749e-05, | |
| "loss": 1.2568, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 3.5369479656219482, | |
| "learning_rate": 1.1179497487437187e-05, | |
| "loss": 1.2472, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 3.9671337604522705, | |
| "learning_rate": 1.1159396984924625e-05, | |
| "loss": 1.2917, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.720705986022949, | |
| "learning_rate": 1.1139296482412061e-05, | |
| "loss": 1.235, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.337419509887695, | |
| "learning_rate": 1.1119195979899497e-05, | |
| "loss": 1.3176, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 5.101902961730957, | |
| "learning_rate": 1.1099095477386937e-05, | |
| "loss": 1.3028, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 4.49253511428833, | |
| "learning_rate": 1.1078994974874373e-05, | |
| "loss": 1.2426, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 6.949131011962891, | |
| "learning_rate": 1.105889447236181e-05, | |
| "loss": 1.2352, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 3.002480983734131, | |
| "learning_rate": 1.1038793969849246e-05, | |
| "loss": 1.2544, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 4.93209171295166, | |
| "learning_rate": 1.1018693467336686e-05, | |
| "loss": 1.2528, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 3.5885465145111084, | |
| "learning_rate": 1.0998592964824122e-05, | |
| "loss": 1.2395, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 6.264705657958984, | |
| "learning_rate": 1.0978492462311558e-05, | |
| "loss": 1.2468, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 4.435594081878662, | |
| "learning_rate": 1.0958391959798994e-05, | |
| "loss": 1.2523, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 2.9626126289367676, | |
| "learning_rate": 1.0938291457286434e-05, | |
| "loss": 1.2618, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 4.376198768615723, | |
| "learning_rate": 1.091819095477387e-05, | |
| "loss": 1.2738, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 4.058696269989014, | |
| "learning_rate": 1.0898090452261307e-05, | |
| "loss": 1.2787, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 5.347177982330322, | |
| "learning_rate": 1.087819095477387e-05, | |
| "loss": 1.2778, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 3.85967755317688, | |
| "learning_rate": 1.0858090452261307e-05, | |
| "loss": 1.2746, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 5.233943939208984, | |
| "learning_rate": 1.0837989949748743e-05, | |
| "loss": 1.283, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 6.364080429077148, | |
| "learning_rate": 1.0817889447236183e-05, | |
| "loss": 1.2685, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 5.601933479309082, | |
| "learning_rate": 1.079778894472362e-05, | |
| "loss": 1.2214, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 7.273884296417236, | |
| "learning_rate": 1.0777688442211056e-05, | |
| "loss": 1.2522, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 4.99397611618042, | |
| "learning_rate": 1.0757587939698494e-05, | |
| "loss": 1.2779, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 6.805306434631348, | |
| "learning_rate": 1.073748743718593e-05, | |
| "loss": 1.2605, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 5.773606777191162, | |
| "learning_rate": 1.0717386934673368e-05, | |
| "loss": 1.2236, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 5.045441150665283, | |
| "learning_rate": 1.0697286432160806e-05, | |
| "loss": 1.2629, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 6.011552333831787, | |
| "learning_rate": 1.0677185929648242e-05, | |
| "loss": 1.2061, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 6.2456817626953125, | |
| "learning_rate": 1.0657085427135678e-05, | |
| "loss": 1.2532, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 4.08701229095459, | |
| "learning_rate": 1.0636984924623116e-05, | |
| "loss": 1.2701, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 4.472239017486572, | |
| "learning_rate": 1.0616884422110554e-05, | |
| "loss": 1.261, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 5.726062297821045, | |
| "learning_rate": 1.059678391959799e-05, | |
| "loss": 1.2506, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 5.6060686111450195, | |
| "learning_rate": 1.0576683417085427e-05, | |
| "loss": 1.2472, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 5.226354122161865, | |
| "learning_rate": 1.0556582914572867e-05, | |
| "loss": 1.2599, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 6.913018703460693, | |
| "learning_rate": 1.0536482412060303e-05, | |
| "loss": 1.2555, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 4.932835578918457, | |
| "learning_rate": 1.0516381909547739e-05, | |
| "loss": 1.2658, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 5.260751724243164, | |
| "learning_rate": 1.0496281407035175e-05, | |
| "loss": 1.3025, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 4.677539825439453, | |
| "learning_rate": 1.0476180904522615e-05, | |
| "loss": 1.2367, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.679705619812012, | |
| "learning_rate": 1.0456080402010051e-05, | |
| "loss": 1.2693, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 5.315084934234619, | |
| "learning_rate": 1.0435979899497488e-05, | |
| "loss": 1.3188, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 5.218776702880859, | |
| "learning_rate": 1.0415879396984926e-05, | |
| "loss": 1.25, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 3.9905712604522705, | |
| "learning_rate": 1.0395778894472364e-05, | |
| "loss": 1.2318, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 5.051150798797607, | |
| "learning_rate": 1.03756783919598e-05, | |
| "loss": 1.2564, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 4.899648666381836, | |
| "learning_rate": 1.0355577889447238e-05, | |
| "loss": 1.2616, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 5.728534698486328, | |
| "learning_rate": 1.0335477386934674e-05, | |
| "loss": 1.2387, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 4.747395038604736, | |
| "learning_rate": 1.0315376884422112e-05, | |
| "loss": 1.2352, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 3.0430312156677246, | |
| "learning_rate": 1.0295276381909548e-05, | |
| "loss": 1.2302, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 4.847692012786865, | |
| "learning_rate": 1.0275175879396986e-05, | |
| "loss": 1.23, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 4.695472240447998, | |
| "learning_rate": 1.0255075376884423e-05, | |
| "loss": 1.2543, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 4.463906764984131, | |
| "learning_rate": 1.0234974874371859e-05, | |
| "loss": 1.2596, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 4.5606770515441895, | |
| "learning_rate": 1.0214874371859299e-05, | |
| "loss": 1.235, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 5.294122219085693, | |
| "learning_rate": 1.0194773869346735e-05, | |
| "loss": 1.2505, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 3.5599803924560547, | |
| "learning_rate": 1.0174673366834171e-05, | |
| "loss": 1.2817, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 3.720597982406616, | |
| "learning_rate": 1.0154572864321607e-05, | |
| "loss": 1.2628, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 4.774421215057373, | |
| "learning_rate": 1.0134472361809047e-05, | |
| "loss": 1.2705, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 5.8144330978393555, | |
| "learning_rate": 1.0114371859296483e-05, | |
| "loss": 1.2982, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 6.6790385246276855, | |
| "learning_rate": 1.009427135678392e-05, | |
| "loss": 1.2301, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 4.106939792633057, | |
| "learning_rate": 1.0074170854271358e-05, | |
| "loss": 1.2355, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 3.354093551635742, | |
| "learning_rate": 1.0054070351758796e-05, | |
| "loss": 1.2459, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 1.3116555213928223, | |
| "eval_runtime": 21.8654, | |
| "eval_samples_per_second": 45.734, | |
| "eval_steps_per_second": 5.717, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 5.311431407928467, | |
| "learning_rate": 1.0033969849246232e-05, | |
| "loss": 1.2402, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 5.163784503936768, | |
| "learning_rate": 1.0013869346733668e-05, | |
| "loss": 1.2329, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 5.684957981109619, | |
| "learning_rate": 9.993768844221106e-06, | |
| "loss": 1.2403, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 6.14009952545166, | |
| "learning_rate": 9.973668341708544e-06, | |
| "loss": 1.2217, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 6.6271796226501465, | |
| "learning_rate": 9.95356783919598e-06, | |
| "loss": 1.2122, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 4.011633396148682, | |
| "learning_rate": 9.933467336683418e-06, | |
| "loss": 1.2242, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 4.849850177764893, | |
| "learning_rate": 9.913366834170856e-06, | |
| "loss": 1.2349, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 3.841789484024048, | |
| "learning_rate": 9.893266331658293e-06, | |
| "loss": 1.2556, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 7.193374156951904, | |
| "learning_rate": 9.87316582914573e-06, | |
| "loss": 1.2119, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 3.556542158126831, | |
| "learning_rate": 9.853065326633167e-06, | |
| "loss": 1.2426, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 6.54746150970459, | |
| "learning_rate": 9.832964824120603e-06, | |
| "loss": 1.1936, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 4.4405951499938965, | |
| "learning_rate": 9.812864321608041e-06, | |
| "loss": 1.2591, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 5.398285865783691, | |
| "learning_rate": 9.792763819095477e-06, | |
| "loss": 1.2375, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 5.835482120513916, | |
| "learning_rate": 9.772864321608041e-06, | |
| "loss": 1.2626, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 5.3824543952941895, | |
| "learning_rate": 9.752763819095478e-06, | |
| "loss": 1.2145, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 4.847600936889648, | |
| "learning_rate": 9.732663316582916e-06, | |
| "loss": 1.2699, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 4.644218921661377, | |
| "learning_rate": 9.712562814070352e-06, | |
| "loss": 1.2669, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 5.046612739562988, | |
| "learning_rate": 9.69246231155779e-06, | |
| "loss": 1.2425, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 3.9644362926483154, | |
| "learning_rate": 9.672361809045226e-06, | |
| "loss": 1.2802, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 4.848786354064941, | |
| "learning_rate": 9.652261306532664e-06, | |
| "loss": 1.2257, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 5.159448623657227, | |
| "learning_rate": 9.6321608040201e-06, | |
| "loss": 1.2156, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 6.021341800689697, | |
| "learning_rate": 9.612060301507538e-06, | |
| "loss": 1.2122, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 7.450246334075928, | |
| "learning_rate": 9.591959798994975e-06, | |
| "loss": 1.2452, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 4.6322503089904785, | |
| "learning_rate": 9.571859296482413e-06, | |
| "loss": 1.2615, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 9.119827270507812, | |
| "learning_rate": 9.551758793969849e-06, | |
| "loss": 1.2484, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 7.022191047668457, | |
| "learning_rate": 9.531658291457287e-06, | |
| "loss": 1.2413, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 6.008714199066162, | |
| "learning_rate": 9.511557788944725e-06, | |
| "loss": 1.2545, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 4.883365631103516, | |
| "learning_rate": 9.491457286432161e-06, | |
| "loss": 1.256, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 4.666494846343994, | |
| "learning_rate": 9.4713567839196e-06, | |
| "loss": 1.212, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 6.942872524261475, | |
| "learning_rate": 9.451256281407035e-06, | |
| "loss": 1.2539, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 8.57226848602295, | |
| "learning_rate": 9.431155778894473e-06, | |
| "loss": 1.238, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 3.5034737586975098, | |
| "learning_rate": 9.411055276381911e-06, | |
| "loss": 1.1608, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 4.17569637298584, | |
| "learning_rate": 9.390954773869348e-06, | |
| "loss": 1.2335, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 5.2211012840271, | |
| "learning_rate": 9.370854271356786e-06, | |
| "loss": 1.2664, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 3.3811118602752686, | |
| "learning_rate": 9.350753768844222e-06, | |
| "loss": 1.2317, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 6.415603160858154, | |
| "learning_rate": 9.33065326633166e-06, | |
| "loss": 1.2613, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 4.720609188079834, | |
| "learning_rate": 9.310552763819096e-06, | |
| "loss": 1.2124, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 5.3697710037231445, | |
| "learning_rate": 9.290452261306533e-06, | |
| "loss": 1.2388, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 4.376136302947998, | |
| "learning_rate": 9.27035175879397e-06, | |
| "loss": 1.2495, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 3.973159074783325, | |
| "learning_rate": 9.250251256281407e-06, | |
| "loss": 1.2389, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 6.894681930541992, | |
| "learning_rate": 9.230150753768845e-06, | |
| "loss": 1.2056, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 4.781852722167969, | |
| "learning_rate": 9.210050251256281e-06, | |
| "loss": 1.2316, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 4.408322334289551, | |
| "learning_rate": 9.189949748743719e-06, | |
| "loss": 1.2945, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 4.875626564025879, | |
| "learning_rate": 9.169849246231157e-06, | |
| "loss": 1.2458, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 5.88706111907959, | |
| "learning_rate": 9.149748743718593e-06, | |
| "loss": 1.2257, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 4.785450458526611, | |
| "learning_rate": 9.129648241206031e-06, | |
| "loss": 1.2372, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 4.651752948760986, | |
| "learning_rate": 9.109547738693468e-06, | |
| "loss": 1.2146, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 5.303548336029053, | |
| "learning_rate": 9.089447236180905e-06, | |
| "loss": 1.2307, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 4.032742977142334, | |
| "learning_rate": 9.069346733668343e-06, | |
| "loss": 1.2467, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 4.288597583770752, | |
| "learning_rate": 9.04924623115578e-06, | |
| "loss": 1.248, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 5.981525897979736, | |
| "learning_rate": 9.029145728643218e-06, | |
| "loss": 1.2344, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 4.837640762329102, | |
| "learning_rate": 9.009045226130654e-06, | |
| "loss": 1.2305, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 5.082337856292725, | |
| "learning_rate": 8.988944723618092e-06, | |
| "loss": 1.2199, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 5.879444599151611, | |
| "learning_rate": 8.968844221105528e-06, | |
| "loss": 1.2158, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 4.926747798919678, | |
| "learning_rate": 8.948944723618092e-06, | |
| "loss": 1.2575, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 4.744166851043701, | |
| "learning_rate": 8.928844221105529e-06, | |
| "loss": 1.2325, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 5.994776725769043, | |
| "learning_rate": 8.908743718592967e-06, | |
| "loss": 1.2199, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 4.0552215576171875, | |
| "learning_rate": 8.888643216080403e-06, | |
| "loss": 1.2214, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 6.152566432952881, | |
| "learning_rate": 8.868542713567841e-06, | |
| "loss": 1.2565, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.718895435333252, | |
| "learning_rate": 8.848442211055277e-06, | |
| "loss": 1.2452, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 6.39285135269165, | |
| "learning_rate": 8.828341708542715e-06, | |
| "loss": 1.2, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 6.312516689300537, | |
| "learning_rate": 8.808241206030151e-06, | |
| "loss": 1.2739, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 5.090448379516602, | |
| "learning_rate": 8.78814070351759e-06, | |
| "loss": 1.2405, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 3.6773719787597656, | |
| "learning_rate": 8.768040201005026e-06, | |
| "loss": 1.2231, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 3.831404209136963, | |
| "learning_rate": 8.747939698492462e-06, | |
| "loss": 1.2068, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 6.460518836975098, | |
| "learning_rate": 8.7278391959799e-06, | |
| "loss": 1.2585, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 5.495750427246094, | |
| "learning_rate": 8.707738693467336e-06, | |
| "loss": 1.2489, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 3.785914421081543, | |
| "learning_rate": 8.687638190954774e-06, | |
| "loss": 1.1986, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 3.443301200866699, | |
| "learning_rate": 8.667537688442212e-06, | |
| "loss": 1.233, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 7.463976860046387, | |
| "learning_rate": 8.647437185929648e-06, | |
| "loss": 1.214, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 5.533621788024902, | |
| "learning_rate": 8.627336683417086e-06, | |
| "loss": 1.2116, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 5.238498210906982, | |
| "learning_rate": 8.607236180904524e-06, | |
| "loss": 1.1792, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 5.2816996574401855, | |
| "learning_rate": 8.58713567839196e-06, | |
| "loss": 1.2359, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 4.712213516235352, | |
| "learning_rate": 8.567035175879399e-06, | |
| "loss": 1.1824, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 3.9056365489959717, | |
| "learning_rate": 8.546934673366835e-06, | |
| "loss": 1.2206, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 6.587601184844971, | |
| "learning_rate": 8.527035175879397e-06, | |
| "loss": 1.2542, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 4.349347114562988, | |
| "learning_rate": 8.506934673366835e-06, | |
| "loss": 1.2516, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 4.775893211364746, | |
| "learning_rate": 8.486834170854272e-06, | |
| "loss": 1.18, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 4.952343940734863, | |
| "learning_rate": 8.46673366834171e-06, | |
| "loss": 1.2287, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 5.0424089431762695, | |
| "learning_rate": 8.446834170854272e-06, | |
| "loss": 1.2044, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 5.429243564605713, | |
| "learning_rate": 8.426733668341708e-06, | |
| "loss": 1.1936, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 4.514014720916748, | |
| "learning_rate": 8.406633165829146e-06, | |
| "loss": 1.2417, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 4.364452362060547, | |
| "learning_rate": 8.386532663316584e-06, | |
| "loss": 1.2004, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 3.5190353393554688, | |
| "learning_rate": 8.36643216080402e-06, | |
| "loss": 1.2144, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 5.794633865356445, | |
| "learning_rate": 8.346331658291458e-06, | |
| "loss": 1.2072, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 4.060710430145264, | |
| "learning_rate": 8.326231155778895e-06, | |
| "loss": 1.225, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 4.3035664558410645, | |
| "learning_rate": 8.306130653266333e-06, | |
| "loss": 1.1935, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 4.8658246994018555, | |
| "learning_rate": 8.28603015075377e-06, | |
| "loss": 1.2316, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 3.5524916648864746, | |
| "learning_rate": 8.265929648241207e-06, | |
| "loss": 1.2119, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 3.966935157775879, | |
| "learning_rate": 8.245829145728645e-06, | |
| "loss": 1.2427, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 5.453131675720215, | |
| "learning_rate": 8.225728643216081e-06, | |
| "loss": 1.1948, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 6.029975414276123, | |
| "learning_rate": 8.20562814070352e-06, | |
| "loss": 1.1688, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 3.311718225479126, | |
| "learning_rate": 8.185527638190955e-06, | |
| "loss": 1.2195, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 3.63813853263855, | |
| "learning_rate": 8.165427135678393e-06, | |
| "loss": 1.2288, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 4.470839500427246, | |
| "learning_rate": 8.14532663316583e-06, | |
| "loss": 1.2273, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 4.462855815887451, | |
| "learning_rate": 8.125226130653266e-06, | |
| "loss": 1.2066, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 4.757040023803711, | |
| "learning_rate": 8.105125628140704e-06, | |
| "loss": 1.2159, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 5.637049674987793, | |
| "learning_rate": 8.08502512562814e-06, | |
| "loss": 1.2449, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 5.578622341156006, | |
| "learning_rate": 8.064924623115578e-06, | |
| "loss": 1.2401, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 6.322601318359375, | |
| "learning_rate": 8.044824120603014e-06, | |
| "loss": 1.2546, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.232736587524414, | |
| "eval_runtime": 21.8587, | |
| "eval_samples_per_second": 45.748, | |
| "eval_steps_per_second": 5.719, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 4.387004375457764, | |
| "learning_rate": 8.024723618090452e-06, | |
| "loss": 1.1591, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 3.7655532360076904, | |
| "learning_rate": 8.00462311557789e-06, | |
| "loss": 1.2148, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 4.917843341827393, | |
| "learning_rate": 7.984522613065327e-06, | |
| "loss": 1.1924, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 4.71078634262085, | |
| "learning_rate": 7.964422110552765e-06, | |
| "loss": 1.1649, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 6.106967449188232, | |
| "learning_rate": 7.944321608040203e-06, | |
| "loss": 1.2001, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 5.224365711212158, | |
| "learning_rate": 7.924221105527639e-06, | |
| "loss": 1.1912, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 6.110058784484863, | |
| "learning_rate": 7.904120603015077e-06, | |
| "loss": 1.2004, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 4.606750965118408, | |
| "learning_rate": 7.884020100502513e-06, | |
| "loss": 1.1824, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 4.328644275665283, | |
| "learning_rate": 7.863919597989951e-06, | |
| "loss": 1.1818, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 5.017879009246826, | |
| "learning_rate": 7.843819095477387e-06, | |
| "loss": 1.1843, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 6.072721481323242, | |
| "learning_rate": 7.823718592964825e-06, | |
| "loss": 1.1876, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 5.169823169708252, | |
| "learning_rate": 7.803618090452262e-06, | |
| "loss": 1.1762, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 5.349250793457031, | |
| "learning_rate": 7.7835175879397e-06, | |
| "loss": 1.1541, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 5.824612140655518, | |
| "learning_rate": 7.763417085427136e-06, | |
| "loss": 1.1696, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 6.2018938064575195, | |
| "learning_rate": 7.743316582914574e-06, | |
| "loss": 1.1621, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 4.709869384765625, | |
| "learning_rate": 7.72321608040201e-06, | |
| "loss": 1.1777, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 4.259114742279053, | |
| "learning_rate": 7.703115577889448e-06, | |
| "loss": 1.1883, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 5.505044460296631, | |
| "learning_rate": 7.683015075376884e-06, | |
| "loss": 1.1896, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 5.121050834655762, | |
| "learning_rate": 7.662914572864322e-06, | |
| "loss": 1.1533, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 3.265988349914551, | |
| "learning_rate": 7.642814070351759e-06, | |
| "loss": 1.1746, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 4.327176094055176, | |
| "learning_rate": 7.622713567839196e-06, | |
| "loss": 1.2136, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 6.783113479614258, | |
| "learning_rate": 7.602613065326634e-06, | |
| "loss": 1.1538, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 6.27109956741333, | |
| "learning_rate": 7.582512562814071e-06, | |
| "loss": 1.1995, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 6.903465270996094, | |
| "learning_rate": 7.562412060301508e-06, | |
| "loss": 1.1468, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 4.696254253387451, | |
| "learning_rate": 7.542311557788945e-06, | |
| "loss": 1.199, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 4.280835151672363, | |
| "learning_rate": 7.522211055276382e-06, | |
| "loss": 1.1616, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 5.0677924156188965, | |
| "learning_rate": 7.5021105527638195e-06, | |
| "loss": 1.1575, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 6.276374816894531, | |
| "learning_rate": 7.4820100502512574e-06, | |
| "loss": 1.137, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 4.777525424957275, | |
| "learning_rate": 7.461909547738694e-06, | |
| "loss": 1.201, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 4.285521030426025, | |
| "learning_rate": 7.441809045226132e-06, | |
| "loss": 1.1924, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 6.18180513381958, | |
| "learning_rate": 7.421708542713568e-06, | |
| "loss": 1.1347, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 6.091145038604736, | |
| "learning_rate": 7.401608040201006e-06, | |
| "loss": 1.1795, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 5.018629550933838, | |
| "learning_rate": 7.381507537688442e-06, | |
| "loss": 1.1755, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 5.406840801239014, | |
| "learning_rate": 7.36140703517588e-06, | |
| "loss": 1.1869, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 4.727605819702148, | |
| "learning_rate": 7.341306532663317e-06, | |
| "loss": 1.1772, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 5.590334892272949, | |
| "learning_rate": 7.3212060301507544e-06, | |
| "loss": 1.1686, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 5.156420707702637, | |
| "learning_rate": 7.3011055276381916e-06, | |
| "loss": 1.1696, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 6.83641242980957, | |
| "learning_rate": 7.2810050251256296e-06, | |
| "loss": 1.1732, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 4.086230278015137, | |
| "learning_rate": 7.260904522613066e-06, | |
| "loss": 1.1536, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 7.394796371459961, | |
| "learning_rate": 7.241005025125629e-06, | |
| "loss": 1.1636, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 6.290234088897705, | |
| "learning_rate": 7.220904522613066e-06, | |
| "loss": 1.1486, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 5.5817036628723145, | |
| "learning_rate": 7.200804020100503e-06, | |
| "loss": 1.1424, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 5.681445598602295, | |
| "learning_rate": 7.1807035175879405e-06, | |
| "loss": 1.1591, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 6.768691062927246, | |
| "learning_rate": 7.160603015075377e-06, | |
| "loss": 1.1293, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 5.4178595542907715, | |
| "learning_rate": 7.140502512562815e-06, | |
| "loss": 1.1729, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 4.215164661407471, | |
| "learning_rate": 7.120402010050251e-06, | |
| "loss": 1.1783, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 5.66365385055542, | |
| "learning_rate": 7.100301507537689e-06, | |
| "loss": 1.1967, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 5.554622650146484, | |
| "learning_rate": 7.080201005025126e-06, | |
| "loss": 1.1466, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 5.001458644866943, | |
| "learning_rate": 7.060100502512563e-06, | |
| "loss": 1.1732, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 2.8027803897857666, | |
| "learning_rate": 7.04e-06, | |
| "loss": 1.152, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 4.733661651611328, | |
| "learning_rate": 7.019899497487438e-06, | |
| "loss": 1.1155, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 7.268320083618164, | |
| "learning_rate": 6.999798994974875e-06, | |
| "loss": 1.1703, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 6.752691745758057, | |
| "learning_rate": 6.979698492462313e-06, | |
| "loss": 1.1986, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 5.244182586669922, | |
| "learning_rate": 6.959597989949749e-06, | |
| "loss": 1.148, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 5.481043815612793, | |
| "learning_rate": 6.939497487437187e-06, | |
| "loss": 1.1811, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 5.892518997192383, | |
| "learning_rate": 6.919396984924623e-06, | |
| "loss": 1.1574, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 5.347742557525635, | |
| "learning_rate": 6.899296482412061e-06, | |
| "loss": 1.1353, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 5.078448295593262, | |
| "learning_rate": 6.879195979899498e-06, | |
| "loss": 1.1398, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 4.15362548828125, | |
| "learning_rate": 6.859095477386935e-06, | |
| "loss": 1.1556, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 4.748194694519043, | |
| "learning_rate": 6.8389949748743725e-06, | |
| "loss": 1.1929, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 5.56561803817749, | |
| "learning_rate": 6.81889447236181e-06, | |
| "loss": 1.1907, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 4.8242316246032715, | |
| "learning_rate": 6.798793969849247e-06, | |
| "loss": 1.166, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 5.7045087814331055, | |
| "learning_rate": 6.778693467336685e-06, | |
| "loss": 1.1579, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 4.583883285522461, | |
| "learning_rate": 6.758592964824121e-06, | |
| "loss": 1.1259, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 5.085745811462402, | |
| "learning_rate": 6.738492462311559e-06, | |
| "loss": 1.1431, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 4.655329704284668, | |
| "learning_rate": 6.718391959798995e-06, | |
| "loss": 1.1681, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 4.375367164611816, | |
| "learning_rate": 6.698291457286433e-06, | |
| "loss": 1.1428, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 5.143255710601807, | |
| "learning_rate": 6.6781909547738695e-06, | |
| "loss": 1.1676, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 5.463631629943848, | |
| "learning_rate": 6.658090452261307e-06, | |
| "loss": 1.1742, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 5.112860679626465, | |
| "learning_rate": 6.637989949748745e-06, | |
| "loss": 1.1655, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 3.88566517829895, | |
| "learning_rate": 6.617889447236181e-06, | |
| "loss": 1.1428, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 5.075991153717041, | |
| "learning_rate": 6.597788944723619e-06, | |
| "loss": 1.1853, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 6.4206061363220215, | |
| "learning_rate": 6.577688442211055e-06, | |
| "loss": 1.1622, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 4.789801597595215, | |
| "learning_rate": 6.557587939698493e-06, | |
| "loss": 1.2021, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 4.722198486328125, | |
| "learning_rate": 6.53748743718593e-06, | |
| "loss": 1.1515, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 4.774144649505615, | |
| "learning_rate": 6.517386934673367e-06, | |
| "loss": 1.1429, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 3.848876953125, | |
| "learning_rate": 6.49748743718593e-06, | |
| "loss": 1.1719, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 6.2731804847717285, | |
| "learning_rate": 6.477386934673368e-06, | |
| "loss": 1.1513, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 6.135923385620117, | |
| "learning_rate": 6.457286432160804e-06, | |
| "loss": 1.1758, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 5.437047958374023, | |
| "learning_rate": 6.437185929648242e-06, | |
| "loss": 1.174, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 5.043646335601807, | |
| "learning_rate": 6.417085427135678e-06, | |
| "loss": 1.171, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 4.104462623596191, | |
| "learning_rate": 6.396984924623116e-06, | |
| "loss": 1.1712, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 2.740678310394287, | |
| "learning_rate": 6.376884422110553e-06, | |
| "loss": 1.1127, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 5.4752936363220215, | |
| "learning_rate": 6.3567839195979905e-06, | |
| "loss": 1.1971, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 7.34414529800415, | |
| "learning_rate": 6.336683417085428e-06, | |
| "loss": 1.1714, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 3.3866333961486816, | |
| "learning_rate": 6.316582914572866e-06, | |
| "loss": 1.1604, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 5.284789085388184, | |
| "learning_rate": 6.296482412060302e-06, | |
| "loss": 1.1792, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 4.47866678237915, | |
| "learning_rate": 6.27638190954774e-06, | |
| "loss": 1.1343, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 6.508190631866455, | |
| "learning_rate": 6.256281407035176e-06, | |
| "loss": 1.1286, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 5.973139762878418, | |
| "learning_rate": 6.236180904522614e-06, | |
| "loss": 1.1774, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 4.717242240905762, | |
| "learning_rate": 6.21608040201005e-06, | |
| "loss": 1.1273, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 5.430871486663818, | |
| "learning_rate": 6.195979899497488e-06, | |
| "loss": 1.1764, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 4.484432697296143, | |
| "learning_rate": 6.1758793969849255e-06, | |
| "loss": 1.154, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 4.041011333465576, | |
| "learning_rate": 6.155778894472362e-06, | |
| "loss": 1.1614, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 4.026901721954346, | |
| "learning_rate": 6.1356783919598e-06, | |
| "loss": 1.1376, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 6.2372660636901855, | |
| "learning_rate": 6.115577889447236e-06, | |
| "loss": 1.1556, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 5.324029445648193, | |
| "learning_rate": 6.095477386934674e-06, | |
| "loss": 1.1533, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 7.105170726776123, | |
| "learning_rate": 6.07537688442211e-06, | |
| "loss": 1.1895, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 23.556692123413086, | |
| "learning_rate": 6.055276381909548e-06, | |
| "loss": 1.1569, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 5.792446613311768, | |
| "learning_rate": 6.035175879396985e-06, | |
| "loss": 1.1488, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "eval_loss": 1.2055375576019287, | |
| "eval_runtime": 21.7807, | |
| "eval_samples_per_second": 45.912, | |
| "eval_steps_per_second": 5.739, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 5.223493576049805, | |
| "learning_rate": 6.0150753768844225e-06, | |
| "loss": 1.144, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 4.998097896575928, | |
| "learning_rate": 5.99497487437186e-06, | |
| "loss": 1.154, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 6.396411895751953, | |
| "learning_rate": 5.975075376884423e-06, | |
| "loss": 1.1398, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 4.5480427742004395, | |
| "learning_rate": 5.954974874371859e-06, | |
| "loss": 1.1753, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 5.8266167640686035, | |
| "learning_rate": 5.934874371859297e-06, | |
| "loss": 1.1481, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 3.3341739177703857, | |
| "learning_rate": 5.9147738693467334e-06, | |
| "loss": 1.1801, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 5.731110572814941, | |
| "learning_rate": 5.894673366834171e-06, | |
| "loss": 1.1945, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 5.0252251625061035, | |
| "learning_rate": 5.8745728643216085e-06, | |
| "loss": 1.1171, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 6.497035503387451, | |
| "learning_rate": 5.854472361809046e-06, | |
| "loss": 1.1299, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 3.014439582824707, | |
| "learning_rate": 5.834371859296483e-06, | |
| "loss": 1.1135, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 6.145033359527588, | |
| "learning_rate": 5.814271356783921e-06, | |
| "loss": 1.1544, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 4.730653285980225, | |
| "learning_rate": 5.794170854271357e-06, | |
| "loss": 1.1501, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 4.570452690124512, | |
| "learning_rate": 5.774070351758795e-06, | |
| "loss": 1.1638, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 3.947618007659912, | |
| "learning_rate": 5.753969849246231e-06, | |
| "loss": 1.1436, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 6.833681106567383, | |
| "learning_rate": 5.733869346733669e-06, | |
| "loss": 1.1271, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 4.837987422943115, | |
| "learning_rate": 5.7137688442211056e-06, | |
| "loss": 1.1503, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 4.822892189025879, | |
| "learning_rate": 5.6936683417085435e-06, | |
| "loss": 1.1089, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 5.022984027862549, | |
| "learning_rate": 5.673567839195981e-06, | |
| "loss": 1.1562, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 5.105147838592529, | |
| "learning_rate": 5.653467336683418e-06, | |
| "loss": 1.1174, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 4.37985372543335, | |
| "learning_rate": 5.633366834170855e-06, | |
| "loss": 1.1627, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 3.854820966720581, | |
| "learning_rate": 5.613266331658291e-06, | |
| "loss": 1.1316, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 7.305357933044434, | |
| "learning_rate": 5.593165829145729e-06, | |
| "loss": 1.1827, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 7.693294048309326, | |
| "learning_rate": 5.5730653266331654e-06, | |
| "loss": 1.1649, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 4.14479398727417, | |
| "learning_rate": 5.552964824120603e-06, | |
| "loss": 1.1844, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 6.665209770202637, | |
| "learning_rate": 5.5328643216080405e-06, | |
| "loss": 1.1304, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 4.629899978637695, | |
| "learning_rate": 5.512763819095478e-06, | |
| "loss": 1.1318, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 3.4445009231567383, | |
| "learning_rate": 5.492663316582915e-06, | |
| "loss": 1.1532, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 3.96806001663208, | |
| "learning_rate": 5.472562814070353e-06, | |
| "loss": 1.1332, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 7.256198406219482, | |
| "learning_rate": 5.452462311557789e-06, | |
| "loss": 1.1339, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 4.4425458908081055, | |
| "learning_rate": 5.432361809045227e-06, | |
| "loss": 1.1615, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 6.114246368408203, | |
| "learning_rate": 5.412261306532663e-06, | |
| "loss": 1.1359, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 3.9182209968566895, | |
| "learning_rate": 5.392160804020101e-06, | |
| "loss": 1.1574, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 6.088989734649658, | |
| "learning_rate": 5.3720603015075376e-06, | |
| "loss": 1.0938, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 6.2887163162231445, | |
| "learning_rate": 5.3519597989949755e-06, | |
| "loss": 1.1546, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 5.033719539642334, | |
| "learning_rate": 5.331859296482413e-06, | |
| "loss": 1.1177, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 6.611480236053467, | |
| "learning_rate": 5.31175879396985e-06, | |
| "loss": 1.1238, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 7.612136363983154, | |
| "learning_rate": 5.291658291457287e-06, | |
| "loss": 1.1494, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 3.842085599899292, | |
| "learning_rate": 5.271557788944725e-06, | |
| "loss": 1.1585, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 4.859694004058838, | |
| "learning_rate": 5.251457286432161e-06, | |
| "loss": 1.1299, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 6.1673808097839355, | |
| "learning_rate": 5.231356783919599e-06, | |
| "loss": 1.1242, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 4.361232280731201, | |
| "learning_rate": 5.2112562814070354e-06, | |
| "loss": 1.1545, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 5.18151330947876, | |
| "learning_rate": 5.191155778894473e-06, | |
| "loss": 1.0794, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 5.049399375915527, | |
| "learning_rate": 5.17105527638191e-06, | |
| "loss": 1.136, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 4.28516149520874, | |
| "learning_rate": 5.150954773869347e-06, | |
| "loss": 1.177, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 3.71186900138855, | |
| "learning_rate": 5.13105527638191e-06, | |
| "loss": 1.1747, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 8.269830703735352, | |
| "learning_rate": 5.110954773869348e-06, | |
| "loss": 1.1494, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 4.453472137451172, | |
| "learning_rate": 5.09105527638191e-06, | |
| "loss": 1.1511, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 6.132487773895264, | |
| "learning_rate": 5.070954773869348e-06, | |
| "loss": 1.1604, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.084425926208496, | |
| "learning_rate": 5.050854271356785e-06, | |
| "loss": 1.1582, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.719122886657715, | |
| "learning_rate": 5.030753768844222e-06, | |
| "loss": 1.0998, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 6.886116981506348, | |
| "learning_rate": 5.010854271356784e-06, | |
| "loss": 1.1402, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 3.9882471561431885, | |
| "learning_rate": 4.990753768844221e-06, | |
| "loss": 1.1163, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 7.415084362030029, | |
| "learning_rate": 4.9706532663316585e-06, | |
| "loss": 1.1145, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 4.16004753112793, | |
| "learning_rate": 4.950552763819096e-06, | |
| "loss": 1.1071, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 5.258670806884766, | |
| "learning_rate": 4.930452261306533e-06, | |
| "loss": 1.1242, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 5.320291519165039, | |
| "learning_rate": 4.910351758793971e-06, | |
| "loss": 1.1388, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 4.200166702270508, | |
| "learning_rate": 4.890251256281408e-06, | |
| "loss": 1.1395, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 4.569030284881592, | |
| "learning_rate": 4.870150753768845e-06, | |
| "loss": 1.1397, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 4.229086875915527, | |
| "learning_rate": 4.850050251256282e-06, | |
| "loss": 1.1705, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 5.543234825134277, | |
| "learning_rate": 4.829949748743719e-06, | |
| "loss": 1.1403, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 4.94819974899292, | |
| "learning_rate": 4.809849246231156e-06, | |
| "loss": 1.1395, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 3.889681577682495, | |
| "learning_rate": 4.7897487437185935e-06, | |
| "loss": 1.137, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 5.113078594207764, | |
| "learning_rate": 4.769648241206031e-06, | |
| "loss": 1.1171, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 4.799468994140625, | |
| "learning_rate": 4.749547738693468e-06, | |
| "loss": 1.1235, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 4.0734453201293945, | |
| "learning_rate": 4.729447236180905e-06, | |
| "loss": 1.1283, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 4.847392559051514, | |
| "learning_rate": 4.709346733668342e-06, | |
| "loss": 1.1932, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 6.197125434875488, | |
| "learning_rate": 4.689246231155779e-06, | |
| "loss": 1.144, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 5.93238639831543, | |
| "learning_rate": 4.669145728643216e-06, | |
| "loss": 1.0989, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 5.202603340148926, | |
| "learning_rate": 4.649045226130653e-06, | |
| "loss": 1.1312, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 5.290008544921875, | |
| "learning_rate": 4.6289447236180905e-06, | |
| "loss": 1.096, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 4.39517879486084, | |
| "learning_rate": 4.608844221105528e-06, | |
| "loss": 1.1948, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 6.785552024841309, | |
| "learning_rate": 4.588743718592965e-06, | |
| "loss": 1.139, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 5.2934370040893555, | |
| "learning_rate": 4.568643216080402e-06, | |
| "loss": 1.1152, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 5.2376604080200195, | |
| "learning_rate": 4.54854271356784e-06, | |
| "loss": 1.1464, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 6.19007682800293, | |
| "learning_rate": 4.528442211055277e-06, | |
| "loss": 1.1199, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 5.796671390533447, | |
| "learning_rate": 4.508341708542714e-06, | |
| "loss": 1.1399, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 5.685388565063477, | |
| "learning_rate": 4.488241206030151e-06, | |
| "loss": 1.1485, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 6.502816677093506, | |
| "learning_rate": 4.468140703517588e-06, | |
| "loss": 1.15, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 4.437497138977051, | |
| "learning_rate": 4.4480402010050255e-06, | |
| "loss": 1.1344, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 6.776554107666016, | |
| "learning_rate": 4.427939698492463e-06, | |
| "loss": 1.1486, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 3.9491071701049805, | |
| "learning_rate": 4.4078391959799e-06, | |
| "loss": 1.1269, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 5.485203266143799, | |
| "learning_rate": 4.387738693467337e-06, | |
| "loss": 1.1539, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 7.40858793258667, | |
| "learning_rate": 4.367638190954774e-06, | |
| "loss": 1.1281, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 4.498636722564697, | |
| "learning_rate": 4.347537688442212e-06, | |
| "loss": 1.1912, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 6.4472856521606445, | |
| "learning_rate": 4.327437185929649e-06, | |
| "loss": 1.1276, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 6.126656532287598, | |
| "learning_rate": 4.307336683417086e-06, | |
| "loss": 1.1329, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 5.280217170715332, | |
| "learning_rate": 4.287236180904523e-06, | |
| "loss": 1.093, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 6.618311405181885, | |
| "learning_rate": 4.2671356783919605e-06, | |
| "loss": 1.1215, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 5.225731372833252, | |
| "learning_rate": 4.247035175879397e-06, | |
| "loss": 1.1252, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 6.615197658538818, | |
| "learning_rate": 4.226934673366834e-06, | |
| "loss": 1.0734, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 5.426534175872803, | |
| "learning_rate": 4.206834170854272e-06, | |
| "loss": 1.0851, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 3.9240429401397705, | |
| "learning_rate": 4.186733668341709e-06, | |
| "loss": 1.1033, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 4.4714250564575195, | |
| "learning_rate": 4.166633165829146e-06, | |
| "loss": 1.1347, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 10.831876754760742, | |
| "learning_rate": 4.146532663316583e-06, | |
| "loss": 1.0971, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 5.725513458251953, | |
| "learning_rate": 4.12643216080402e-06, | |
| "loss": 1.1346, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 6.024409770965576, | |
| "learning_rate": 4.1063316582914575e-06, | |
| "loss": 1.1551, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 3.672581195831299, | |
| "learning_rate": 4.086231155778895e-06, | |
| "loss": 1.1365, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 5.520185470581055, | |
| "learning_rate": 4.066130653266332e-06, | |
| "loss": 1.1321, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 6.262924671173096, | |
| "learning_rate": 4.046030150753769e-06, | |
| "loss": 1.1062, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 6.248514652252197, | |
| "learning_rate": 4.025929648241206e-06, | |
| "loss": 1.0963, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.1962475776672363, | |
| "eval_runtime": 21.7746, | |
| "eval_samples_per_second": 45.925, | |
| "eval_steps_per_second": 5.741, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 4.919951915740967, | |
| "learning_rate": 4.005829145728643e-06, | |
| "loss": 1.0946, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 4.365044593811035, | |
| "learning_rate": 3.985728643216081e-06, | |
| "loss": 1.0987, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 9.315962791442871, | |
| "learning_rate": 3.965628140703518e-06, | |
| "loss": 1.0575, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": 10.50012493133545, | |
| "learning_rate": 3.945527638190955e-06, | |
| "loss": 1.069, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 8.853384017944336, | |
| "learning_rate": 3.9254271356783925e-06, | |
| "loss": 1.0638, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 5.290276527404785, | |
| "learning_rate": 3.90532663316583e-06, | |
| "loss": 1.0422, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 5.553296089172363, | |
| "learning_rate": 3.885226130653267e-06, | |
| "loss": 1.0517, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 5.262092113494873, | |
| "learning_rate": 3.865125628140704e-06, | |
| "loss": 1.1184, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 8.444523811340332, | |
| "learning_rate": 3.845025125628141e-06, | |
| "loss": 1.0861, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 5.442756652832031, | |
| "learning_rate": 3.824924623115578e-06, | |
| "loss": 1.0559, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 5.926506519317627, | |
| "learning_rate": 3.8050251256281414e-06, | |
| "loss": 1.0482, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 5.596924781799316, | |
| "learning_rate": 3.7849246231155785e-06, | |
| "loss": 1.071, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 8.789709091186523, | |
| "learning_rate": 3.7648241206030156e-06, | |
| "loss": 1.0796, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 5.048920631408691, | |
| "learning_rate": 3.7447236180904528e-06, | |
| "loss": 1.0602, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 6.4080705642700195, | |
| "learning_rate": 3.72462311557789e-06, | |
| "loss": 1.0481, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 6.361392974853516, | |
| "learning_rate": 3.7045226130653266e-06, | |
| "loss": 1.0393, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 7.596906661987305, | |
| "learning_rate": 3.6844221105527637e-06, | |
| "loss": 1.0212, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 5.352015972137451, | |
| "learning_rate": 3.6643216080402013e-06, | |
| "loss": 1.0159, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 5.569393157958984, | |
| "learning_rate": 3.6442211055276384e-06, | |
| "loss": 1.0516, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 5.463687419891357, | |
| "learning_rate": 3.6241206030150755e-06, | |
| "loss": 1.0729, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 11.406126976013184, | |
| "learning_rate": 3.6040201005025127e-06, | |
| "loss": 1.0389, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 6.264597415924072, | |
| "learning_rate": 3.58391959798995e-06, | |
| "loss": 1.0613, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 6.288965225219727, | |
| "learning_rate": 3.5638190954773873e-06, | |
| "loss": 1.0486, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 5.622555732727051, | |
| "learning_rate": 3.5437185929648245e-06, | |
| "loss": 1.0665, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 4.834249973297119, | |
| "learning_rate": 3.5236180904522616e-06, | |
| "loss": 1.074, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 8.10172176361084, | |
| "learning_rate": 3.5035175879396987e-06, | |
| "loss": 1.0272, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 5.477063179016113, | |
| "learning_rate": 3.483417085427136e-06, | |
| "loss": 1.0405, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 4.51005220413208, | |
| "learning_rate": 3.463316582914573e-06, | |
| "loss": 1.0339, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 7.996946811676025, | |
| "learning_rate": 3.4432160804020105e-06, | |
| "loss": 1.0159, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 6.292382717132568, | |
| "learning_rate": 3.4231155778894477e-06, | |
| "loss": 1.085, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 8.48137092590332, | |
| "learning_rate": 3.4030150753768848e-06, | |
| "loss": 1.047, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 6.187898635864258, | |
| "learning_rate": 3.382914572864322e-06, | |
| "loss": 1.0577, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 4.8612775802612305, | |
| "learning_rate": 3.3630150753768847e-06, | |
| "loss": 1.0273, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 5.060107231140137, | |
| "learning_rate": 3.342914572864322e-06, | |
| "loss": 1.0998, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 4.8297505378723145, | |
| "learning_rate": 3.322814070351759e-06, | |
| "loss": 1.0821, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 4.36783504486084, | |
| "learning_rate": 3.3027135678391965e-06, | |
| "loss": 1.0591, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 7.225709438323975, | |
| "learning_rate": 3.2826130653266337e-06, | |
| "loss": 1.0245, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 5.521580219268799, | |
| "learning_rate": 3.262512562814071e-06, | |
| "loss": 1.0131, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 4.925292491912842, | |
| "learning_rate": 3.242412060301508e-06, | |
| "loss": 1.023, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 5.887296676635742, | |
| "learning_rate": 3.222311557788945e-06, | |
| "loss": 1.0233, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 8.528314590454102, | |
| "learning_rate": 3.2022110552763826e-06, | |
| "loss": 1.0593, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 5.414628982543945, | |
| "learning_rate": 3.182110552763819e-06, | |
| "loss": 1.0526, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 8.320696830749512, | |
| "learning_rate": 3.1620100502512564e-06, | |
| "loss": 1.061, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 7.6390767097473145, | |
| "learning_rate": 3.1419095477386936e-06, | |
| "loss": 1.0789, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 6.141530990600586, | |
| "learning_rate": 3.1218090452261307e-06, | |
| "loss": 1.0438, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 5.907855987548828, | |
| "learning_rate": 3.101708542713568e-06, | |
| "loss": 1.068, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 8.13024616241455, | |
| "learning_rate": 3.081608040201005e-06, | |
| "loss": 1.0612, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 4.439888954162598, | |
| "learning_rate": 3.0615075376884425e-06, | |
| "loss": 1.0589, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 5.636837005615234, | |
| "learning_rate": 3.0414070351758796e-06, | |
| "loss": 1.0306, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 4.6407389640808105, | |
| "learning_rate": 3.0213065326633168e-06, | |
| "loss": 1.0509, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 5.515079975128174, | |
| "learning_rate": 3.001206030150754e-06, | |
| "loss": 1.0273, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 5.22351598739624, | |
| "learning_rate": 2.981105527638191e-06, | |
| "loss": 1.0579, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 6.4437479972839355, | |
| "learning_rate": 2.961206030150754e-06, | |
| "loss": 1.0435, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 6.850719928741455, | |
| "learning_rate": 2.941105527638191e-06, | |
| "loss": 1.0487, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 4.1141276359558105, | |
| "learning_rate": 2.9210050251256285e-06, | |
| "loss": 1.0124, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 6.114626884460449, | |
| "learning_rate": 2.9009045226130656e-06, | |
| "loss": 1.0146, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 5.943951606750488, | |
| "learning_rate": 2.8808040201005028e-06, | |
| "loss": 1.0484, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 6.745482444763184, | |
| "learning_rate": 2.86070351758794e-06, | |
| "loss": 1.0353, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 6.165544033050537, | |
| "learning_rate": 2.840603015075377e-06, | |
| "loss": 1.0572, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 5.070137023925781, | |
| "learning_rate": 2.8205025125628146e-06, | |
| "loss": 1.0607, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 5.177759647369385, | |
| "learning_rate": 2.8004020100502517e-06, | |
| "loss": 1.0589, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 5.926203727722168, | |
| "learning_rate": 2.780301507537689e-06, | |
| "loss": 1.0728, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 4.766726970672607, | |
| "learning_rate": 2.760201005025126e-06, | |
| "loss": 1.066, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 7.0791401863098145, | |
| "learning_rate": 2.740100502512563e-06, | |
| "loss": 1.0381, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 6.904074668884277, | |
| "learning_rate": 2.7200000000000002e-06, | |
| "loss": 1.0341, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 7.680102348327637, | |
| "learning_rate": 2.700100502512563e-06, | |
| "loss": 1.0288, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 7.589695930480957, | |
| "learning_rate": 2.68e-06, | |
| "loss": 1.0677, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 4.686061382293701, | |
| "learning_rate": 2.6598994974874377e-06, | |
| "loss": 1.0213, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 5.1108269691467285, | |
| "learning_rate": 2.639798994974875e-06, | |
| "loss": 1.0501, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 6.551150798797607, | |
| "learning_rate": 2.619698492462312e-06, | |
| "loss": 1.0598, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 6.834580898284912, | |
| "learning_rate": 2.599597989949749e-06, | |
| "loss": 1.0757, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 4.995261192321777, | |
| "learning_rate": 2.579497487437186e-06, | |
| "loss": 1.0653, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 5.614989757537842, | |
| "learning_rate": 2.559396984924623e-06, | |
| "loss": 1.0412, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 12.592680931091309, | |
| "learning_rate": 2.5392964824120605e-06, | |
| "loss": 1.1065, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 7.026024341583252, | |
| "learning_rate": 2.5191959798994976e-06, | |
| "loss": 1.064, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 6.075479507446289, | |
| "learning_rate": 2.499095477386935e-06, | |
| "loss": 1.0559, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 7.670622825622559, | |
| "learning_rate": 2.4789949748743723e-06, | |
| "loss": 1.0173, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 6.775985240936279, | |
| "learning_rate": 2.458894472361809e-06, | |
| "loss": 1.0192, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 6.7632222175598145, | |
| "learning_rate": 2.438793969849246e-06, | |
| "loss": 1.0403, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 6.712850570678711, | |
| "learning_rate": 2.4186934673366837e-06, | |
| "loss": 1.0408, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 4.842583656311035, | |
| "learning_rate": 2.398592964824121e-06, | |
| "loss": 1.0711, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 6.134751796722412, | |
| "learning_rate": 2.378492462311558e-06, | |
| "loss": 1.0085, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 5.06552791595459, | |
| "learning_rate": 2.358391959798995e-06, | |
| "loss": 1.0088, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 6.872971057891846, | |
| "learning_rate": 2.338291457286432e-06, | |
| "loss": 1.0492, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 5.2886528968811035, | |
| "learning_rate": 2.3181909547738697e-06, | |
| "loss": 1.0699, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 7.221102237701416, | |
| "learning_rate": 2.298090452261307e-06, | |
| "loss": 1.0505, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 7.388364791870117, | |
| "learning_rate": 2.277989949748744e-06, | |
| "loss": 1.0127, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 11.533583641052246, | |
| "learning_rate": 2.257889447236181e-06, | |
| "loss": 1.055, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 5.406151294708252, | |
| "learning_rate": 2.2377889447236182e-06, | |
| "loss": 1.0848, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 8.093517303466797, | |
| "learning_rate": 2.2176884422110554e-06, | |
| "loss": 1.0058, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 7.073362827301025, | |
| "learning_rate": 2.1975879396984925e-06, | |
| "loss": 1.0233, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 6.263842582702637, | |
| "learning_rate": 2.1776884422110558e-06, | |
| "loss": 1.0259, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 4.668169975280762, | |
| "learning_rate": 2.1575879396984925e-06, | |
| "loss": 1.0641, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 5.550297737121582, | |
| "learning_rate": 2.1374874371859296e-06, | |
| "loss": 1.0768, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 6.35700798034668, | |
| "learning_rate": 2.1173869346733667e-06, | |
| "loss": 1.0237, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 8.346928596496582, | |
| "learning_rate": 2.0972864321608043e-06, | |
| "loss": 1.0265, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 5.672070026397705, | |
| "learning_rate": 2.0771859296482414e-06, | |
| "loss": 1.0309, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 7.057281494140625, | |
| "learning_rate": 2.0570854271356785e-06, | |
| "loss": 1.0131, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 5.305558204650879, | |
| "learning_rate": 2.0369849246231156e-06, | |
| "loss": 1.075, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 7.687346935272217, | |
| "learning_rate": 2.0168844221105528e-06, | |
| "loss": 1.0525, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 1.240631341934204, | |
| "eval_runtime": 21.7808, | |
| "eval_samples_per_second": 45.912, | |
| "eval_steps_per_second": 5.739, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 4.474082946777344, | |
| "learning_rate": 1.9967839195979903e-06, | |
| "loss": 1.0245, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 7.113109588623047, | |
| "learning_rate": 1.9766834170854275e-06, | |
| "loss": 1.0475, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 5.945183753967285, | |
| "learning_rate": 1.9565829145728646e-06, | |
| "loss": 1.066, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 5.280254364013672, | |
| "learning_rate": 1.9364824120603017e-06, | |
| "loss": 1.0428, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 4.7843098640441895, | |
| "learning_rate": 1.916381909547739e-06, | |
| "loss": 1.0468, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 6.668610572814941, | |
| "learning_rate": 1.896281407035176e-06, | |
| "loss": 1.0357, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 6.481570720672607, | |
| "learning_rate": 1.876180904522613e-06, | |
| "loss": 1.0502, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 6.947873115539551, | |
| "learning_rate": 1.8560804020100504e-06, | |
| "loss": 1.0129, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 5.446567058563232, | |
| "learning_rate": 1.8359798994974876e-06, | |
| "loss": 1.0222, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 6.875363349914551, | |
| "learning_rate": 1.8158793969849247e-06, | |
| "loss": 1.0444, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 5.002197742462158, | |
| "learning_rate": 1.795778894472362e-06, | |
| "loss": 1.0339, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 6.167435169219971, | |
| "learning_rate": 1.7756783919597991e-06, | |
| "loss": 0.9916, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 4.765905380249023, | |
| "learning_rate": 1.7555778894472365e-06, | |
| "loss": 1.0356, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 9.28632926940918, | |
| "learning_rate": 1.7354773869346736e-06, | |
| "loss": 1.052, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 5.864830017089844, | |
| "learning_rate": 1.7153768844221107e-06, | |
| "loss": 1.014, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 10.017616271972656, | |
| "learning_rate": 1.6952763819095477e-06, | |
| "loss": 1.0347, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 6.52495813369751, | |
| "learning_rate": 1.675175879396985e-06, | |
| "loss": 1.0094, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 5.506436347961426, | |
| "learning_rate": 1.6550753768844221e-06, | |
| "loss": 1.0306, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 6.378169059753418, | |
| "learning_rate": 1.6349748743718595e-06, | |
| "loss": 1.0481, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 7.866368293762207, | |
| "learning_rate": 1.6148743718592966e-06, | |
| "loss": 1.0433, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 3.3074705600738525, | |
| "learning_rate": 1.5947738693467337e-06, | |
| "loss": 1.0255, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 5.663514137268066, | |
| "learning_rate": 1.574673366834171e-06, | |
| "loss": 1.0066, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 5.797703266143799, | |
| "learning_rate": 1.5545728643216082e-06, | |
| "loss": 1.0223, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 5.430294513702393, | |
| "learning_rate": 1.5344723618090453e-06, | |
| "loss": 1.0361, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 7.778083324432373, | |
| "learning_rate": 1.5143718592964826e-06, | |
| "loss": 1.0128, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 8.762657165527344, | |
| "learning_rate": 1.4942713567839198e-06, | |
| "loss": 0.9764, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 6.608733654022217, | |
| "learning_rate": 1.4741708542713571e-06, | |
| "loss": 1.0355, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 4.251249313354492, | |
| "learning_rate": 1.454070351758794e-06, | |
| "loss": 1.0383, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 5.131290912628174, | |
| "learning_rate": 1.4339698492462312e-06, | |
| "loss": 1.0118, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 5.982537746429443, | |
| "learning_rate": 1.4138693467336683e-06, | |
| "loss": 1.0031, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 5.640321254730225, | |
| "learning_rate": 1.3937688442211056e-06, | |
| "loss": 0.9834, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 4.641716003417969, | |
| "learning_rate": 1.3738693467336682e-06, | |
| "loss": 1.0313, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 4.14018440246582, | |
| "learning_rate": 1.3537688442211056e-06, | |
| "loss": 1.0132, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 5.720813274383545, | |
| "learning_rate": 1.3336683417085427e-06, | |
| "loss": 1.0398, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 6.412359714508057, | |
| "learning_rate": 1.31356783919598e-06, | |
| "loss": 0.9922, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 6.955712795257568, | |
| "learning_rate": 1.2934673366834172e-06, | |
| "loss": 1.0396, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 9.27692699432373, | |
| "learning_rate": 1.2733668341708543e-06, | |
| "loss": 1.0883, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 5.846386909484863, | |
| "learning_rate": 1.2532663316582916e-06, | |
| "loss": 0.9981, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 7.700655937194824, | |
| "learning_rate": 1.2331658291457288e-06, | |
| "loss": 1.0766, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 6.861111640930176, | |
| "learning_rate": 1.213065326633166e-06, | |
| "loss": 1.0677, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 7.702520370483398, | |
| "learning_rate": 1.192964824120603e-06, | |
| "loss": 1.092, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 7.238519668579102, | |
| "learning_rate": 1.1728643216080404e-06, | |
| "loss": 1.0195, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 5.9995646476745605, | |
| "learning_rate": 1.1527638190954775e-06, | |
| "loss": 1.0125, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 8.999128341674805, | |
| "learning_rate": 1.1326633165829146e-06, | |
| "loss": 1.0342, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 8.37474536895752, | |
| "learning_rate": 1.112562814070352e-06, | |
| "loss": 1.0151, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 7.018558979034424, | |
| "learning_rate": 1.092462311557789e-06, | |
| "loss": 0.9854, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 5.188572883605957, | |
| "learning_rate": 1.0723618090452262e-06, | |
| "loss": 0.9934, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 5.260889530181885, | |
| "learning_rate": 1.0522613065326633e-06, | |
| "loss": 0.9982, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.045933723449707, | |
| "learning_rate": 1.0321608040201007e-06, | |
| "loss": 1.0473, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.305715560913086, | |
| "learning_rate": 1.0120603015075378e-06, | |
| "loss": 0.9831, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 7.344651222229004, | |
| "learning_rate": 9.91959798994975e-07, | |
| "loss": 1.0251, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 8.983135223388672, | |
| "learning_rate": 9.71859296482412e-07, | |
| "loss": 1.0084, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 8.123686790466309, | |
| "learning_rate": 9.51959798994975e-07, | |
| "loss": 1.0136, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 6.7493462562561035, | |
| "learning_rate": 9.318592964824122e-07, | |
| "loss": 0.9987, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 8.338164329528809, | |
| "learning_rate": 9.117587939698493e-07, | |
| "loss": 0.99, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 4.540625095367432, | |
| "learning_rate": 8.916582914572865e-07, | |
| "loss": 1.0015, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 4.909175872802734, | |
| "learning_rate": 8.715577889447237e-07, | |
| "loss": 1.0395, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 7.736073970794678, | |
| "learning_rate": 8.514572864321608e-07, | |
| "loss": 1.0005, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 5.298911094665527, | |
| "learning_rate": 8.313567839195981e-07, | |
| "loss": 1.0283, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 8.024383544921875, | |
| "learning_rate": 8.112562814070353e-07, | |
| "loss": 1.0152, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 6.19573974609375, | |
| "learning_rate": 7.911557788944723e-07, | |
| "loss": 1.0195, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 7.0770182609558105, | |
| "learning_rate": 7.710552763819096e-07, | |
| "loss": 1.0285, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 5.578373908996582, | |
| "learning_rate": 7.509547738693468e-07, | |
| "loss": 0.9974, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 6.602869033813477, | |
| "learning_rate": 7.30854271356784e-07, | |
| "loss": 1.0019, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 8.442864418029785, | |
| "learning_rate": 7.107537688442212e-07, | |
| "loss": 1.035, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 7.632187366485596, | |
| "learning_rate": 6.906532663316584e-07, | |
| "loss": 1.0316, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 9.547595024108887, | |
| "learning_rate": 6.705527638190955e-07, | |
| "loss": 0.9944, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 7.9466938972473145, | |
| "learning_rate": 6.504522613065326e-07, | |
| "loss": 1.0426, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 4.955949783325195, | |
| "learning_rate": 6.303517587939699e-07, | |
| "loss": 1.0797, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 8.598955154418945, | |
| "learning_rate": 6.102512562814071e-07, | |
| "loss": 1.0233, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 5.320690631866455, | |
| "learning_rate": 5.901507537688442e-07, | |
| "loss": 1.0028, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 7.243544101715088, | |
| "learning_rate": 5.700502512562815e-07, | |
| "loss": 0.9832, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 6.318332672119141, | |
| "learning_rate": 5.501507537688443e-07, | |
| "loss": 1.0295, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 5.297775745391846, | |
| "learning_rate": 5.300502512562814e-07, | |
| "loss": 1.0161, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 5.717208385467529, | |
| "learning_rate": 5.099497487437187e-07, | |
| "loss": 1.0508, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 6.545378684997559, | |
| "learning_rate": 4.898492462311558e-07, | |
| "loss": 1.039, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 7.0295867919921875, | |
| "learning_rate": 4.69748743718593e-07, | |
| "loss": 0.9963, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 6.685591697692871, | |
| "learning_rate": 4.4964824120603015e-07, | |
| "loss": 0.9845, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 8.319560050964355, | |
| "learning_rate": 4.295477386934674e-07, | |
| "loss": 0.9951, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 5.060975551605225, | |
| "learning_rate": 4.094472361809045e-07, | |
| "loss": 0.9893, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 7.493504524230957, | |
| "learning_rate": 3.8934673366834175e-07, | |
| "loss": 0.9722, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 6.903368949890137, | |
| "learning_rate": 3.6924623115577893e-07, | |
| "loss": 1.0248, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 7.836544036865234, | |
| "learning_rate": 3.4914572864321606e-07, | |
| "loss": 1.0075, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 7.111424446105957, | |
| "learning_rate": 3.290452261306533e-07, | |
| "loss": 0.9776, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 5.424601078033447, | |
| "learning_rate": 3.0894472361809047e-07, | |
| "loss": 1.0235, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 5.190994739532471, | |
| "learning_rate": 2.8884422110552765e-07, | |
| "loss": 1.0375, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 7.971541404724121, | |
| "learning_rate": 2.6874371859296483e-07, | |
| "loss": 1.0049, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 8.369765281677246, | |
| "learning_rate": 2.48643216080402e-07, | |
| "loss": 1.0028, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 5.76533842086792, | |
| "learning_rate": 2.2854271356783922e-07, | |
| "loss": 0.9663, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 8.707147598266602, | |
| "learning_rate": 2.084422110552764e-07, | |
| "loss": 1.0048, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 7.287750720977783, | |
| "learning_rate": 1.8834170854271358e-07, | |
| "loss": 0.9777, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 6.303613185882568, | |
| "learning_rate": 1.6824120603015078e-07, | |
| "loss": 1.0266, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 6.6418776512146, | |
| "learning_rate": 1.4814070351758796e-07, | |
| "loss": 1.0179, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 7.093784332275391, | |
| "learning_rate": 1.2824120603015077e-07, | |
| "loss": 1.0051, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 9.0274658203125, | |
| "learning_rate": 1.0814070351758795e-07, | |
| "loss": 1.0022, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 9.27753734588623, | |
| "learning_rate": 8.804020100502513e-08, | |
| "loss": 1.0176, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 7.9141364097595215, | |
| "learning_rate": 6.793969849246231e-08, | |
| "loss": 0.9695, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 6.786277770996094, | |
| "learning_rate": 4.7839195979899497e-08, | |
| "loss": 1.0346, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 6.178213119506836, | |
| "learning_rate": 2.7738693467336683e-08, | |
| "loss": 0.987, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 5.494964599609375, | |
| "learning_rate": 7.63819095477387e-09, | |
| "loss": 0.9787, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.162515640258789, | |
| "eval_runtime": 21.775, | |
| "eval_samples_per_second": 45.924, | |
| "eval_steps_per_second": 5.741, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 10000, | |
| "total_flos": 1.1800273747968e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |