Invalid JSON: Unexpected token 'I', ..."_metric": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": null, | |
| "best_metric": Infinity, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 37500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.564, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.572, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.588, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.596, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.604, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.612, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.628, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.636, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.644, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.652, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.668, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.676, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.684, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.692, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.964, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.972, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.988, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.996, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_bleu": 0.12212197044710801, | |
| "eval_cer": 0.8160053321787701, | |
| "eval_f1": 0.35202235691264105, | |
| "eval_loss": NaN, | |
| "eval_meteor": 0.2636940499037806, | |
| "eval_runtime": 655.0147, | |
| "eval_samples_per_second": 7.633, | |
| "eval_steps_per_second": 0.24, | |
| "eval_wer": 0.9984162999487562, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.004, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.012, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.016, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.028, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.036, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.044, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.048, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.052, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.064, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.068, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.076, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.084, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.092, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.108, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.116, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.124, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.1320000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.1400000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.144, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.148, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.156, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.164, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.172, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.176, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.188, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.196, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.204, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.208, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.212, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.228, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.236, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.244, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.252, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.256, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.268, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.276, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.284, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.292, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.304, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.308, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.316, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.324, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.332, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.336, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.3479999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.3559999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3639999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.3719999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.384, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.388, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.396, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.404, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.412, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.428, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.436, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.444, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.448, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.452, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.464, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.468, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.476, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.484, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.492, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 1.496, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.508, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.516, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.524, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 1.528, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.532, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.548, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.556, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.564, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.572, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 1.576, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.588, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.596, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.604, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.612, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.624, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.6280000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.6360000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.6440000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.6520000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.6600000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.6680000000000001, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.6760000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6840000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 1.688, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.692, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.704, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.708, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.716, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.724, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.732, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.748, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.756, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.764, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 1.768, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.772, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.784, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.788, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.796, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.804, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.812, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.8199999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.8279999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.8359999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.8439999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.8519999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.8599999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.8679999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.876, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.884, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.892, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 1.896, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.908, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.916, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.924, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.932, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.944, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.948, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.956, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.964, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.972, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 1.976, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.988, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.996, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_bleu": 0.12212197044710801, | |
| "eval_cer": 0.8160053321787701, | |
| "eval_f1": 0.35202235691264105, | |
| "eval_loss": NaN, | |
| "eval_meteor": 0.2636940499037806, | |
| "eval_runtime": 659.7772, | |
| "eval_samples_per_second": 7.578, | |
| "eval_steps_per_second": 0.238, | |
| "eval_wer": 0.9984162999487562, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.004, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 2.008, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 2.012, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 2.024, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 2.028, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 2.036, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.044, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 2.052, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 2.056, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 2.068, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 2.072, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 2.076, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.084, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.088, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.092, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.104, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.108, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.116, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.124, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.132, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.136, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.148, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.152, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.156, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.164, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.168, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.172, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.184, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.188, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.196, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.204, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.212, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 2.216, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.228, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 2.232, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 2.2359999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.2439999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 2.248, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 2.252, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 2.268, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.276, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.284, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.292, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 2.296, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.308, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 2.312, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.316, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.324, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 2.328, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.332, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 2.344, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.348, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.356, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.364, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.372, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 2.376, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.388, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 2.392, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.396, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.404, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 2.408, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.412, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 2.424, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.428, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.436, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.444, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.452, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 2.456, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 2.468, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 2.472, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 2.476, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.484, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 2.488, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 2.492, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 2.504, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 2.508, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 2.516, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.524, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 2.532, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 2.536, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 2.548, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 2.552, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 2.556, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.564, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 2.568, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 2.572, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 2.584, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 2.588, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 2.596, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.604, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 2.612, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 2.616, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 2.628, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 2.632, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 2.636, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.644, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 2.648, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 2.652, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 2.664, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 2.668, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 2.676, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.684, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 2.692, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 2.6959999999999997, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 2.708, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 2.7119999999999997, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 2.716, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.724, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 2.7279999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 2.732, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 2.7439999999999998, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 2.748, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 2.7560000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.7640000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 2.7720000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 2.776, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 2.7800000000000002, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 2.7880000000000003, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 2.792, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 2.7960000000000003, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.8040000000000003, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 2.808, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 2.8120000000000003, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 2.824, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 2.828, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 2.836, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.844, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 2.852, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 2.856, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 2.868, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 2.872, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 2.876, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.884, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 2.888, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 2.892, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 2.904, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 2.908, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 2.916, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.924, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 2.932, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 2.936, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 2.948, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 2.952, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 2.956, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.964, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 2.968, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 2.972, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 2.984, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 2.988, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 2.996, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0005, | |
| "loss": 0.0, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_bleu": 0.12212197044710801, | |
| "eval_cer": 0.8160053321787701, | |
| "eval_f1": 0.35202235691264105, | |
| "eval_loss": NaN, | |
| "eval_meteor": 0.2636940499037806, | |
| "eval_runtime": 666.2051, | |
| "eval_samples_per_second": 7.505, | |
| "eval_steps_per_second": 0.236, | |
| "eval_wer": 0.9984162999487562, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 37500, | |
| "total_flos": 1.614293713378345e+18, | |
| "train_loss": 0.0, | |
| "train_runtime": 18366.5428, | |
| "train_samples_per_second": 65.336, | |
| "train_steps_per_second": 2.042 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 37500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.614293713378345e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |