{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.062,
  "eval_steps": 1000,
  "global_step": 91000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 2e-06,
      "grad_norm": 29.506126403808594,
      "learning_rate": 0.0,
      "loss": 1.5091,
      "step": 1
    },
    {
      "epoch": 0.0002,
      "grad_norm": 7.35781717300415,
      "learning_rate": 9.9e-07,
      "loss": 1.6562,
      "step": 100
    },
    {
      "epoch": 0.0004,
      "grad_norm": 4.9180989265441895,
      "learning_rate": 1.99e-06,
      "loss": 1.6176,
      "step": 200
    },
    {
      "epoch": 0.0006,
      "grad_norm": 1.8868086338043213,
      "learning_rate": 2.99e-06,
      "loss": 1.548,
      "step": 300
    },
    {
      "epoch": 0.0008,
      "grad_norm": 7.365355491638184,
      "learning_rate": 3.99e-06,
      "loss": 1.4958,
      "step": 400
    },
    {
      "epoch": 0.001,
      "grad_norm": 8.965476989746094,
      "learning_rate": 4.9900000000000005e-06,
      "loss": 1.4918,
      "step": 500
    },
    {
      "epoch": 0.0012,
      "grad_norm": 2.2186834812164307,
      "learning_rate": 5.99e-06,
      "loss": 1.4807,
      "step": 600
    },
    {
      "epoch": 0.0014,
      "grad_norm": 1.970430850982666,
      "learning_rate": 6.990000000000001e-06,
      "loss": 1.4312,
      "step": 700
    },
    {
      "epoch": 0.0016,
      "grad_norm": 1.5914119482040405,
      "learning_rate": 7.99e-06,
      "loss": 1.3848,
      "step": 800
    },
    {
      "epoch": 0.0018,
      "grad_norm": 1.7615679502487183,
      "learning_rate": 8.99e-06,
      "loss": 1.4126,
      "step": 900
    },
    {
      "epoch": 0.002,
      "grad_norm": 1.5981565713882446,
      "learning_rate": 9.990000000000001e-06,
      "loss": 1.3768,
      "step": 1000
    },
    {
      "epoch": 0.002,
      "eval_loss": 1.1488478183746338,
      "eval_runtime": 84.3931,
      "eval_samples_per_second": 182.989,
      "eval_steps_per_second": 2.868,
      "step": 1000
    },
    {
      "epoch": 0.0022,
      "grad_norm": 1.9463247060775757,
      "learning_rate": 1.099e-05,
      "loss": 1.4649,
      "step": 1100
    },
    {
      "epoch": 0.0024,
      "grad_norm": 1.997353434562683,
      "learning_rate": 1.199e-05,
      "loss": 1.422,
      "step": 1200
    },
    {
      "epoch": 0.0026,
      "grad_norm": 2.028587818145752,
      "learning_rate": 1.299e-05,
      "loss": 1.4101,
      "step": 1300
    },
    {
      "epoch": 0.0028,
      "grad_norm": 1.8055784702301025,
      "learning_rate": 1.399e-05,
      "loss": 1.379,
      "step": 1400
    },
    {
      "epoch": 0.003,
      "grad_norm": 2.630389451980591,
      "learning_rate": 1.499e-05,
      "loss": 1.3915,
      "step": 1500
    },
    {
      "epoch": 0.0032,
      "grad_norm": 1.4471231698989868,
      "learning_rate": 1.599e-05,
      "loss": 1.3651,
      "step": 1600
    },
    {
      "epoch": 0.0034,
      "grad_norm": 1.4115934371948242,
      "learning_rate": 1.699e-05,
      "loss": 1.3327,
      "step": 1700
    },
    {
      "epoch": 0.0036,
      "grad_norm": 1.1099858283996582,
      "learning_rate": 1.7990000000000002e-05,
      "loss": 1.304,
      "step": 1800
    },
    {
      "epoch": 0.0038,
      "grad_norm": 1.5767651796340942,
      "learning_rate": 1.8990000000000003e-05,
      "loss": 1.3375,
      "step": 1900
    },
    {
      "epoch": 0.004,
      "grad_norm": 1.3484268188476562,
      "learning_rate": 1.999e-05,
      "loss": 1.3746,
      "step": 2000
    },
    {
      "epoch": 0.004,
      "eval_loss": 1.1486531496047974,
      "eval_runtime": 76.1223,
      "eval_samples_per_second": 202.871,
      "eval_steps_per_second": 3.179,
      "step": 2000
    },
    {
      "epoch": 0.0042,
      "grad_norm": 1.6412079334259033,
      "learning_rate": 2.099e-05,
      "loss": 1.3931,
      "step": 2100
    },
    {
      "epoch": 0.0044,
      "grad_norm": 1.17317533493042,
      "learning_rate": 2.199e-05,
      "loss": 1.3512,
      "step": 2200
    },
    {
      "epoch": 0.0046,
      "grad_norm": 0.8342074751853943,
      "learning_rate": 2.2990000000000002e-05,
      "loss": 1.3805,
      "step": 2300
    },
    {
      "epoch": 0.0048,
      "grad_norm": 1.5843234062194824,
      "learning_rate": 2.3990000000000002e-05,
      "loss": 1.377,
      "step": 2400
    },
    {
      "epoch": 0.005,
      "grad_norm": 1.915511131286621,
      "learning_rate": 2.4990000000000003e-05,
      "loss": 1.3659,
      "step": 2500
    },
    {
      "epoch": 0.0052,
      "grad_norm": 1.6507076025009155,
      "learning_rate": 2.5990000000000004e-05,
      "loss": 1.2875,
      "step": 2600
    },
    {
      "epoch": 0.0054,
      "grad_norm": 1.5680265426635742,
      "learning_rate": 2.6989999999999997e-05,
      "loss": 1.3402,
      "step": 2700
    },
    {
      "epoch": 0.0056,
      "grad_norm": 0.8005309700965881,
      "learning_rate": 2.7989999999999998e-05,
      "loss": 1.3565,
      "step": 2800
    },
    {
      "epoch": 0.0058,
      "grad_norm": 1.664014220237732,
      "learning_rate": 2.8990000000000002e-05,
      "loss": 1.3118,
      "step": 2900
    },
    {
      "epoch": 0.006,
      "grad_norm": 1.1597651243209839,
      "learning_rate": 2.9990000000000003e-05,
      "loss": 1.3207,
      "step": 3000
    },
    {
      "epoch": 0.006,
      "eval_loss": 1.1344993114471436,
      "eval_runtime": 76.5771,
      "eval_samples_per_second": 201.666,
      "eval_steps_per_second": 3.16,
      "step": 3000
    },
    {
      "epoch": 0.0062,
      "grad_norm": 1.6559661626815796,
      "learning_rate": 3.099e-05,
      "loss": 1.3103,
      "step": 3100
    },
    {
      "epoch": 0.0064,
      "grad_norm": 1.390712857246399,
      "learning_rate": 3.1990000000000004e-05,
      "loss": 1.3855,
      "step": 3200
    },
    {
      "epoch": 0.0066,
      "grad_norm": 1.9980418682098389,
      "learning_rate": 3.299e-05,
      "loss": 1.3109,
      "step": 3300
    },
    {
      "epoch": 0.0068,
      "grad_norm": 1.2899682521820068,
      "learning_rate": 3.399e-05,
      "loss": 1.3219,
      "step": 3400
    },
    {
      "epoch": 0.007,
      "grad_norm": 1.44901704788208,
      "learning_rate": 3.499e-05,
      "loss": 1.3089,
      "step": 3500
    },
    {
      "epoch": 0.0072,
      "grad_norm": 1.3377976417541504,
      "learning_rate": 3.599e-05,
      "loss": 1.2995,
      "step": 3600
    },
    {
      "epoch": 0.0074,
      "grad_norm": 1.5043129920959473,
      "learning_rate": 3.699e-05,
      "loss": 1.3421,
      "step": 3700
    },
    {
      "epoch": 0.0076,
      "grad_norm": 1.4387165307998657,
      "learning_rate": 3.799e-05,
      "loss": 1.3337,
      "step": 3800
    },
    {
      "epoch": 0.0078,
      "grad_norm": 1.1607294082641602,
      "learning_rate": 3.8990000000000004e-05,
      "loss": 1.2852,
      "step": 3900
    },
    {
      "epoch": 0.008,
      "grad_norm": 1.0189259052276611,
      "learning_rate": 3.999e-05,
      "loss": 1.3277,
      "step": 4000
    },
    {
      "epoch": 0.008,
      "eval_loss": 1.1298929452896118,
      "eval_runtime": 76.4952,
      "eval_samples_per_second": 201.882,
      "eval_steps_per_second": 3.164,
      "step": 4000
    },
    {
      "epoch": 0.0082,
      "grad_norm": 1.6229581832885742,
      "learning_rate": 4.099e-05,
      "loss": 1.2878,
      "step": 4100
    },
    {
      "epoch": 0.0084,
      "grad_norm": 1.693702220916748,
      "learning_rate": 4.199e-05,
      "loss": 1.313,
      "step": 4200
    },
    {
      "epoch": 0.0086,
      "grad_norm": 1.169730544090271,
      "learning_rate": 4.299e-05,
      "loss": 1.2915,
      "step": 4300
    },
    {
      "epoch": 0.0088,
      "grad_norm": 1.3561712503433228,
      "learning_rate": 4.3990000000000004e-05,
      "loss": 1.3337,
      "step": 4400
    },
    {
      "epoch": 0.009,
      "grad_norm": 1.4713114500045776,
      "learning_rate": 4.499e-05,
      "loss": 1.309,
      "step": 4500
    },
    {
      "epoch": 0.0092,
      "grad_norm": 1.0679044723510742,
      "learning_rate": 4.599e-05,
      "loss": 1.3464,
      "step": 4600
    },
    {
      "epoch": 0.0094,
      "grad_norm": 1.4595869779586792,
      "learning_rate": 4.699e-05,
      "loss": 1.3385,
      "step": 4700
    },
    {
      "epoch": 0.0096,
      "grad_norm": 1.6443949937820435,
      "learning_rate": 4.799e-05,
      "loss": 1.3287,
      "step": 4800
    },
    {
      "epoch": 0.0098,
      "grad_norm": 1.3524634838104248,
      "learning_rate": 4.8990000000000004e-05,
      "loss": 1.3224,
      "step": 4900
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.552986979484558,
      "learning_rate": 4.999e-05,
      "loss": 1.3256,
      "step": 5000
    },
    {
      "epoch": 0.01,
      "eval_loss": 1.1314986944198608,
      "eval_runtime": 76.3433,
      "eval_samples_per_second": 202.284,
      "eval_steps_per_second": 3.17,
      "step": 5000
    },
    {
      "epoch": 0.0102,
      "grad_norm": 1.1126846075057983,
      "learning_rate": 4.9999995065197964e-05,
      "loss": 1.3184,
      "step": 5100
    },
    {
      "epoch": 0.0104,
      "grad_norm": 0.8533400893211365,
      "learning_rate": 4.999998006090441e-05,
      "loss": 1.3145,
      "step": 5200
    },
    {
      "epoch": 0.0106,
      "grad_norm": 1.6032077074050903,
      "learning_rate": 4.9999954986621866e-05,
      "loss": 1.2894,
      "step": 5300
    },
    {
      "epoch": 0.0108,
      "grad_norm": 1.2594430446624756,
      "learning_rate": 4.999991984236044e-05,
      "loss": 1.2515,
      "step": 5400
    },
    {
      "epoch": 0.011,
      "grad_norm": 1.2169750928878784,
      "learning_rate": 4.99998746281343e-05,
      "loss": 1.2603,
      "step": 5500
    },
    {
      "epoch": 0.0112,
      "grad_norm": 1.2038013935089111,
      "learning_rate": 4.999981934396165e-05,
      "loss": 1.3063,
      "step": 5600
    },
    {
      "epoch": 0.0114,
      "grad_norm": 1.1477010250091553,
      "learning_rate": 4.999975398986476e-05,
      "loss": 1.3057,
      "step": 5700
    },
    {
      "epoch": 0.0116,
      "grad_norm": 0.6725754141807556,
      "learning_rate": 4.9999678565869944e-05,
      "loss": 1.3211,
      "step": 5800
    },
    {
      "epoch": 0.0118,
      "grad_norm": 1.5470402240753174,
      "learning_rate": 4.99995930720076e-05,
      "loss": 1.2794,
      "step": 5900
    },
    {
      "epoch": 0.012,
      "grad_norm": 1.8079277276992798,
      "learning_rate": 4.999949750831215e-05,
      "loss": 1.2736,
      "step": 6000
    },
    {
      "epoch": 0.012,
      "eval_loss": 1.1335862874984741,
      "eval_runtime": 76.3508,
      "eval_samples_per_second": 202.264,
      "eval_steps_per_second": 3.17,
      "step": 6000
    },
    {
      "epoch": 0.0122,
      "grad_norm": 1.4117431640625,
      "learning_rate": 4.99993918748221e-05,
      "loss": 1.3142,
      "step": 6100
    },
    {
      "epoch": 0.0124,
      "grad_norm": 1.2657192945480347,
      "learning_rate": 4.999927617157998e-05,
      "loss": 1.3216,
      "step": 6200
    },
    {
      "epoch": 0.0126,
      "grad_norm": 1.0358809232711792,
      "learning_rate": 4.9999150398632425e-05,
      "loss": 1.329,
      "step": 6300
    },
    {
      "epoch": 0.0128,
      "grad_norm": 1.6824450492858887,
      "learning_rate": 4.999901455603007e-05,
      "loss": 1.2911,
      "step": 6400
    },
    {
      "epoch": 0.013,
      "grad_norm": 1.5632168054580688,
      "learning_rate": 4.9998868643827635e-05,
      "loss": 1.3004,
      "step": 6500
    },
    {
      "epoch": 0.0132,
      "grad_norm": 1.254310131072998,
      "learning_rate": 4.99987126620839e-05,
      "loss": 1.2981,
      "step": 6600
    },
    {
      "epoch": 0.0134,
      "grad_norm": 1.4540060758590698,
      "learning_rate": 4.999854661086171e-05,
      "loss": 1.3184,
      "step": 6700
    },
    {
      "epoch": 0.0136,
      "grad_norm": 1.3684179782867432,
      "learning_rate": 4.999837049022792e-05,
      "loss": 1.2914,
      "step": 6800
    },
    {
      "epoch": 0.0138,
      "grad_norm": 1.474075436592102,
      "learning_rate": 4.999818430025349e-05,
      "loss": 1.2702,
      "step": 6900
    },
    {
      "epoch": 0.014,
      "grad_norm": 1.3687875270843506,
      "learning_rate": 4.999798804101341e-05,
      "loss": 1.2388,
      "step": 7000
    },
    {
      "epoch": 0.014,
      "eval_loss": 1.1258224248886108,
      "eval_runtime": 76.3516,
      "eval_samples_per_second": 202.262,
      "eval_steps_per_second": 3.17,
      "step": 7000
    },
    {
      "epoch": 0.0142,
      "grad_norm": 0.6668384075164795,
      "learning_rate": 4.999778171258675e-05,
      "loss": 1.2768,
      "step": 7100
    },
    {
      "epoch": 0.0144,
      "grad_norm": 1.1303478479385376,
      "learning_rate": 4.9997565315056596e-05,
      "loss": 1.2639,
      "step": 7200
    },
    {
      "epoch": 0.0146,
      "grad_norm": 1.516221046447754,
      "learning_rate": 4.999733884851012e-05,
      "loss": 1.2805,
      "step": 7300
    },
    {
      "epoch": 0.0148,
      "grad_norm": 1.3124428987503052,
      "learning_rate": 4.9997102313038544e-05,
      "loss": 1.2811,
      "step": 7400
    },
    {
      "epoch": 0.015,
      "grad_norm": 1.390687346458435,
      "learning_rate": 4.999685570873715e-05,
      "loss": 1.2481,
      "step": 7500
    },
    {
      "epoch": 0.0152,
      "grad_norm": 0.8783305883407593,
      "learning_rate": 4.999659903570526e-05,
      "loss": 1.2986,
      "step": 7600
    },
    {
      "epoch": 0.0154,
      "grad_norm": 1.0741727352142334,
      "learning_rate": 4.999633229404628e-05,
      "loss": 1.2784,
      "step": 7700
    },
    {
      "epoch": 0.0156,
      "grad_norm": 1.022088885307312,
      "learning_rate": 4.999605548386763e-05,
      "loss": 1.2869,
      "step": 7800
    },
    {
      "epoch": 0.0158,
      "grad_norm": 1.0997594594955444,
      "learning_rate": 4.9995768605280826e-05,
      "loss": 1.2736,
      "step": 7900
    },
    {
      "epoch": 0.016,
      "grad_norm": 1.191188931465149,
      "learning_rate": 4.9995471658401414e-05,
      "loss": 1.256,
      "step": 8000
    },
    {
      "epoch": 0.016,
      "eval_loss": 1.1234357357025146,
      "eval_runtime": 76.115,
      "eval_samples_per_second": 202.89,
      "eval_steps_per_second": 3.179,
      "step": 8000
    },
    {
      "epoch": 0.0162,
      "grad_norm": 0.7304887175559998,
      "learning_rate": 4.9995164643349015e-05,
      "loss": 1.2717,
      "step": 8100
    },
    {
      "epoch": 0.0164,
      "grad_norm": 1.2335166931152344,
      "learning_rate": 4.9994847560247276e-05,
      "loss": 1.2657,
      "step": 8200
    },
    {
      "epoch": 0.0166,
      "grad_norm": 1.424973487854004,
      "learning_rate": 4.999452040922393e-05,
      "loss": 1.3235,
      "step": 8300
    },
    {
      "epoch": 0.0168,
      "grad_norm": 1.1544169187545776,
      "learning_rate": 4.999418319041076e-05,
      "loss": 1.2455,
      "step": 8400
    },
    {
      "epoch": 0.017,
      "grad_norm": 1.1393338441848755,
      "learning_rate": 4.9993835903943585e-05,
      "loss": 1.233,
      "step": 8500
    },
    {
      "epoch": 0.0172,
      "grad_norm": 1.1183439493179321,
      "learning_rate": 4.99934785499623e-05,
      "loss": 1.2282,
      "step": 8600
    },
    {
      "epoch": 0.0174,
      "grad_norm": 1.275148868560791,
      "learning_rate": 4.999311112861084e-05,
      "loss": 1.2665,
      "step": 8700
    },
    {
      "epoch": 0.0176,
      "grad_norm": 1.4136372804641724,
      "learning_rate": 4.99927336400372e-05,
      "loss": 1.2617,
      "step": 8800
    },
    {
      "epoch": 0.0178,
      "grad_norm": 1.392327904701233,
      "learning_rate": 4.999234608439345e-05,
      "loss": 1.292,
      "step": 8900
    },
    {
      "epoch": 0.018,
      "grad_norm": 1.367475152015686,
      "learning_rate": 4.9991948461835685e-05,
      "loss": 1.2153,
      "step": 9000
    },
    {
      "epoch": 0.018,
      "eval_loss": 1.1127148866653442,
      "eval_runtime": 76.2524,
      "eval_samples_per_second": 202.525,
      "eval_steps_per_second": 3.174,
      "step": 9000
    },
    {
      "epoch": 0.0182,
      "grad_norm": 0.8793131709098816,
      "learning_rate": 4.999154077252407e-05,
      "loss": 1.2734,
      "step": 9100
    },
    {
      "epoch": 0.0184,
      "grad_norm": 0.6496739387512207,
      "learning_rate": 4.999112301662281e-05,
      "loss": 1.2498,
      "step": 9200
    },
    {
      "epoch": 0.0186,
      "grad_norm": 1.1462939977645874,
      "learning_rate": 4.99906951943002e-05,
      "loss": 1.2549,
      "step": 9300
    },
    {
      "epoch": 0.0188,
      "grad_norm": 1.520691156387329,
      "learning_rate": 4.999025730572854e-05,
      "loss": 1.2437,
      "step": 9400
    },
    {
      "epoch": 0.019,
      "grad_norm": 1.3555136919021606,
      "learning_rate": 4.998980935108424e-05,
      "loss": 1.2326,
      "step": 9500
    },
    {
      "epoch": 0.0192,
      "grad_norm": 1.467217206954956,
      "learning_rate": 4.9989351330547715e-05,
      "loss": 1.2768,
      "step": 9600
    },
    {
      "epoch": 0.0194,
      "grad_norm": 1.3842765092849731,
      "learning_rate": 4.998888324430346e-05,
      "loss": 1.2675,
      "step": 9700
    },
    {
      "epoch": 0.0196,
      "grad_norm": 1.344078540802002,
      "learning_rate": 4.998840509254003e-05,
      "loss": 1.2619,
      "step": 9800
    },
    {
      "epoch": 0.0198,
      "grad_norm": 0.7567517757415771,
      "learning_rate": 4.998791687545001e-05,
      "loss": 1.2794,
      "step": 9900
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9987697601318359,
      "learning_rate": 4.998741859323006e-05,
      "loss": 1.2778,
      "step": 10000
    },
    {
      "epoch": 0.02,
      "eval_loss": 1.1275579929351807,
      "eval_runtime": 76.2888,
      "eval_samples_per_second": 202.428,
      "eval_steps_per_second": 3.172,
      "step": 10000
    },
    {
      "epoch": 0.0202,
      "grad_norm": 1.5212323665618896,
      "learning_rate": 4.9986910246080894e-05,
      "loss": 1.2884,
      "step": 10100
    },
    {
      "epoch": 0.0204,
      "grad_norm": 1.5730245113372803,
      "learning_rate": 4.998639183420727e-05,
      "loss": 1.282,
      "step": 10200
    },
    {
      "epoch": 0.0206,
      "grad_norm": 0.8342368602752686,
      "learning_rate": 4.9985863357818e-05,
      "loss": 1.2408,
      "step": 10300
    },
    {
      "epoch": 0.0208,
      "grad_norm": 1.3672316074371338,
      "learning_rate": 4.998532481712596e-05,
      "loss": 1.2205,
      "step": 10400
    },
    {
      "epoch": 0.021,
      "grad_norm": 1.1164605617523193,
      "learning_rate": 4.998477621234806e-05,
      "loss": 1.2817,
      "step": 10500
    },
    {
      "epoch": 0.0212,
      "grad_norm": 1.2867449522018433,
      "learning_rate": 4.99842175437053e-05,
      "loss": 1.2598,
      "step": 10600
    },
    {
      "epoch": 0.0214,
      "grad_norm": 1.6646244525909424,
      "learning_rate": 4.99836488114227e-05,
      "loss": 1.2163,
      "step": 10700
    },
    {
      "epoch": 0.0216,
      "grad_norm": 1.3233399391174316,
      "learning_rate": 4.998307001572935e-05,
      "loss": 1.2744,
      "step": 10800
    },
    {
      "epoch": 0.0218,
      "grad_norm": 1.1658077239990234,
      "learning_rate": 4.9982481156858385e-05,
      "loss": 1.274,
      "step": 10900
    },
    {
      "epoch": 0.022,
      "grad_norm": 1.4505467414855957,
      "learning_rate": 4.9981882235046995e-05,
      "loss": 1.2645,
      "step": 11000
    },
    {
      "epoch": 0.022,
      "eval_loss": 1.1138958930969238,
      "eval_runtime": 76.7643,
      "eval_samples_per_second": 201.174,
      "eval_steps_per_second": 3.153,
      "step": 11000
    },
    {
      "epoch": 0.0222,
      "grad_norm": 0.8515588641166687,
      "learning_rate": 4.998127325053642e-05,
      "loss": 1.2359,
      "step": 11100
    },
    {
      "epoch": 0.0224,
      "grad_norm": 1.4022259712219238,
      "learning_rate": 4.9980654203571983e-05,
      "loss": 1.2515,
      "step": 11200
    },
    {
      "epoch": 0.0226,
      "grad_norm": 1.5902676582336426,
      "learning_rate": 4.998002509440301e-05,
      "loss": 1.2305,
      "step": 11300
    },
    {
      "epoch": 0.0228,
      "grad_norm": 0.763087809085846,
      "learning_rate": 4.997938592328292e-05,
      "loss": 1.2312,
      "step": 11400
    },
    {
      "epoch": 0.023,
      "grad_norm": 1.4949332475662231,
      "learning_rate": 4.997873669046916e-05,
      "loss": 1.2768,
      "step": 11500
    },
    {
      "epoch": 0.0232,
      "grad_norm": 1.0390666723251343,
      "learning_rate": 4.9978077396223255e-05,
      "loss": 1.2355,
      "step": 11600
    },
    {
      "epoch": 0.0234,
      "grad_norm": 0.6799549460411072,
      "learning_rate": 4.997740804081076e-05,
      "loss": 1.264,
      "step": 11700
    },
    {
      "epoch": 0.0236,
      "grad_norm": 1.4702496528625488,
      "learning_rate": 4.99767286245013e-05,
      "loss": 1.3092,
      "step": 11800
    },
    {
      "epoch": 0.0238,
      "grad_norm": 1.3574661016464233,
      "learning_rate": 4.997603914756853e-05,
      "loss": 1.2654,
      "step": 11900
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.1170625686645508,
      "learning_rate": 4.9975339610290175e-05,
      "loss": 1.2343,
      "step": 12000
    },
    {
      "epoch": 0.024,
      "eval_loss": 1.1109821796417236,
      "eval_runtime": 76.4587,
      "eval_samples_per_second": 201.978,
      "eval_steps_per_second": 3.165,
      "step": 12000
    },
    {
      "epoch": 0.0242,
      "grad_norm": 1.2707583904266357,
      "learning_rate": 4.997463001294802e-05,
      "loss": 1.2525,
      "step": 12100
    },
    {
      "epoch": 0.0244,
      "grad_norm": 1.2613739967346191,
      "learning_rate": 4.997391035582788e-05,
      "loss": 1.2698,
      "step": 12200
    },
    {
      "epoch": 0.0246,
      "grad_norm": 1.1995183229446411,
      "learning_rate": 4.997318063921963e-05,
      "loss": 1.237,
      "step": 12300
    },
    {
      "epoch": 0.0248,
      "grad_norm": 0.729535698890686,
      "learning_rate": 4.997244086341721e-05,
      "loss": 1.2248,
      "step": 12400
    },
    {
      "epoch": 0.025,
      "grad_norm": 1.3250787258148193,
      "learning_rate": 4.9971691028718594e-05,
      "loss": 1.2617,
      "step": 12500
    },
    {
      "epoch": 0.0252,
      "grad_norm": 1.421278476715088,
      "learning_rate": 4.997093113542582e-05,
      "loss": 1.2321,
      "step": 12600
    },
    {
      "epoch": 0.0254,
      "grad_norm": 1.5168310403823853,
      "learning_rate": 4.997016118384497e-05,
      "loss": 1.2268,
      "step": 12700
    },
    {
      "epoch": 0.0256,
      "grad_norm": 1.045483946800232,
      "learning_rate": 4.996938117428618e-05,
      "loss": 1.2714,
      "step": 12800
    },
    {
      "epoch": 0.0258,
      "grad_norm": 0.8379656076431274,
      "learning_rate": 4.9968591107063647e-05,
      "loss": 1.2792,
      "step": 12900
    },
    {
      "epoch": 0.026,
      "grad_norm": 1.620133638381958,
      "learning_rate": 4.996779098249559e-05,
      "loss": 1.2456,
      "step": 13000
    },
    {
      "epoch": 0.026,
      "eval_loss": 1.1081608533859253,
      "eval_runtime": 76.4734,
      "eval_samples_per_second": 201.939,
      "eval_steps_per_second": 3.164,
      "step": 13000
    },
    {
      "epoch": 0.0262,
      "grad_norm": 1.2181329727172852,
      "learning_rate": 4.9966980800904315e-05,
      "loss": 1.2187,
      "step": 13100
    },
    {
      "epoch": 0.0264,
      "grad_norm": 1.4935636520385742,
      "learning_rate": 4.996616056261616e-05,
      "loss": 1.2405,
      "step": 13200
    },
    {
      "epoch": 0.0266,
      "grad_norm": 1.3096436262130737,
      "learning_rate": 4.996533026796152e-05,
      "loss": 1.2599,
      "step": 13300
    },
    {
      "epoch": 0.0268,
      "grad_norm": 1.5392045974731445,
      "learning_rate": 4.996448991727483e-05,
      "loss": 1.2491,
      "step": 13400
    },
    {
      "epoch": 0.027,
      "grad_norm": 1.3175737857818604,
      "learning_rate": 4.996363951089459e-05,
      "loss": 1.2383,
      "step": 13500
    },
    {
      "epoch": 0.0272,
      "grad_norm": 1.3839282989501953,
      "learning_rate": 4.9962779049163335e-05,
      "loss": 1.2739,
      "step": 13600
    },
    {
      "epoch": 0.0274,
      "grad_norm": 0.8403354287147522,
      "learning_rate": 4.996190853242767e-05,
      "loss": 1.2378,
      "step": 13700
    },
    {
      "epoch": 0.0276,
      "grad_norm": 1.2463191747665405,
      "learning_rate": 4.996102796103823e-05,
      "loss": 1.2248,
      "step": 13800
    },
    {
      "epoch": 0.0278,
      "grad_norm": 1.466070294380188,
      "learning_rate": 4.996013733534971e-05,
      "loss": 1.2567,
      "step": 13900
    },
    {
      "epoch": 0.028,
      "grad_norm": 0.8661775588989258,
      "learning_rate": 4.995923665572085e-05,
      "loss": 1.2372,
      "step": 14000
    },
    {
      "epoch": 0.028,
      "eval_loss": 1.113655686378479,
      "eval_runtime": 76.3727,
      "eval_samples_per_second": 202.206,
      "eval_steps_per_second": 3.169,
      "step": 14000
    },
    {
      "epoch": 0.0282,
      "grad_norm": 0.9262897968292236,
      "learning_rate": 4.9958325922514466e-05,
      "loss": 1.2082,
      "step": 14100
    },
    {
      "epoch": 0.0284,
      "grad_norm": 1.406928539276123,
      "learning_rate": 4.995740513609738e-05,
      "loss": 1.2576,
      "step": 14200
    },
    {
      "epoch": 0.0286,
      "grad_norm": 0.9858616590499878,
      "learning_rate": 4.9956474296840485e-05,
      "loss": 1.2173,
      "step": 14300
    },
    {
      "epoch": 0.0288,
      "grad_norm": 0.6425116062164307,
      "learning_rate": 4.9955533405118725e-05,
      "loss": 1.237,
      "step": 14400
    },
    {
      "epoch": 0.029,
      "grad_norm": 0.7704317569732666,
      "learning_rate": 4.9954582461311106e-05,
      "loss": 1.286,
      "step": 14500
    },
    {
      "epoch": 0.0292,
      "grad_norm": 1.2745368480682373,
      "learning_rate": 4.995362146580065e-05,
      "loss": 1.2553,
      "step": 14600
    },
    {
      "epoch": 0.0294,
      "grad_norm": 1.1889222860336304,
      "learning_rate": 4.995265041897444e-05,
      "loss": 1.2783,
      "step": 14700
    },
    {
      "epoch": 0.0296,
      "grad_norm": 1.4223252534866333,
      "learning_rate": 4.9951669321223645e-05,
      "loss": 1.27,
      "step": 14800
    },
    {
      "epoch": 0.0298,
      "grad_norm": 1.0991147756576538,
      "learning_rate": 4.995067817294342e-05,
      "loss": 1.2373,
      "step": 14900
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.2834559679031372,
      "learning_rate": 4.994967697453301e-05,
      "loss": 1.2725,
      "step": 15000
    },
    {
      "epoch": 0.03,
      "eval_loss": 1.1147979497909546,
      "eval_runtime": 77.4863,
      "eval_samples_per_second": 199.3,
      "eval_steps_per_second": 3.123,
      "step": 15000
    },
    {
      "epoch": 0.0302,
      "grad_norm": 1.3690969944000244,
      "learning_rate": 4.9948665726395705e-05,
      "loss": 1.2631,
      "step": 15100
    },
    {
      "epoch": 0.0304,
      "grad_norm": 1.0501981973648071,
      "learning_rate": 4.994764442893882e-05,
      "loss": 1.2614,
      "step": 15200
    },
    {
      "epoch": 0.0306,
      "grad_norm": 1.2085719108581543,
      "learning_rate": 4.994661308257375e-05,
      "loss": 1.1982,
      "step": 15300
    },
    {
      "epoch": 0.0308,
      "grad_norm": 1.1436259746551514,
      "learning_rate": 4.994557168771591e-05,
      "loss": 1.2079,
      "step": 15400
    },
    {
      "epoch": 0.031,
      "grad_norm": 0.8355712890625,
      "learning_rate": 4.994452024478478e-05,
      "loss": 1.2537,
      "step": 15500
    },
    {
      "epoch": 0.0312,
      "grad_norm": 0.9547547698020935,
      "learning_rate": 4.9943458754203875e-05,
      "loss": 1.2399,
      "step": 15600
    },
    {
      "epoch": 0.0314,
      "grad_norm": 1.090165138244629,
      "learning_rate": 4.994238721640077e-05,
      "loss": 1.2324,
      "step": 15700
    },
    {
      "epoch": 0.0316,
      "grad_norm": 0.9351906180381775,
      "learning_rate": 4.9941305631807076e-05,
      "loss": 1.2431,
      "step": 15800
    },
    {
      "epoch": 0.0318,
      "grad_norm": 1.3740676641464233,
      "learning_rate": 4.9940214000858456e-05,
      "loss": 1.2487,
      "step": 15900
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.656019926071167,
      "learning_rate": 4.993911232399462e-05,
      "loss": 1.2371,
      "step": 16000
    },
    {
      "epoch": 0.032,
      "eval_loss": 1.1028244495391846,
      "eval_runtime": 76.4629,
      "eval_samples_per_second": 201.967,
      "eval_steps_per_second": 3.165,
      "step": 16000
    },
    {
      "epoch": 0.0322,
      "grad_norm": 1.20018470287323,
      "learning_rate": 4.9938000601659315e-05,
      "loss": 1.2547,
      "step": 16100
    },
    {
      "epoch": 0.0324,
      "grad_norm": 1.2216906547546387,
      "learning_rate": 4.993687883430036e-05,
      "loss": 1.2327,
      "step": 16200
    },
    {
      "epoch": 0.0326,
      "grad_norm": 1.0969616174697876,
      "learning_rate": 4.99357470223696e-05,
      "loss": 1.2513,
      "step": 16300
    },
    {
      "epoch": 0.0328,
      "grad_norm": 1.026194453239441,
      "learning_rate": 4.99346051663229e-05,
      "loss": 1.2508,
      "step": 16400
    },
    {
      "epoch": 0.033,
      "grad_norm": 1.1246017217636108,
      "learning_rate": 4.993345326662023e-05,
      "loss": 1.2538,
      "step": 16500
    },
    {
      "epoch": 0.0332,
      "grad_norm": 1.293093204498291,
      "learning_rate": 4.993229132372557e-05,
      "loss": 1.2236,
      "step": 16600
    },
    {
      "epoch": 0.0334,
      "grad_norm": 1.208122730255127,
      "learning_rate": 4.993111933810695e-05,
      "loss": 1.2753,
      "step": 16700
    },
    {
      "epoch": 0.0336,
      "grad_norm": 1.073480248451233,
      "learning_rate": 4.992993731023643e-05,
      "loss": 1.2665,
      "step": 16800
    },
    {
      "epoch": 0.0338,
      "grad_norm": 1.4211028814315796,
      "learning_rate": 4.9928745240590146e-05,
      "loss": 1.2388,
      "step": 16900
    },
    {
      "epoch": 0.034,
      "grad_norm": 1.1787285804748535,
      "learning_rate": 4.992754312964827e-05,
      "loss": 1.2118,
      "step": 17000
    },
    {
      "epoch": 0.034,
      "eval_loss": 1.104814887046814,
      "eval_runtime": 76.4454,
      "eval_samples_per_second": 202.013,
      "eval_steps_per_second": 3.166,
      "step": 17000
    },
    {
      "epoch": 0.0342,
      "grad_norm": 0.9049177765846252,
      "learning_rate": 4.992633097789499e-05,
      "loss": 1.1995,
      "step": 17100
    },
    {
      "epoch": 0.0344,
      "grad_norm": 1.2447205781936646,
      "learning_rate": 4.992510878581858e-05,
      "loss": 1.2174,
      "step": 17200
    },
    {
      "epoch": 0.0346,
      "grad_norm": 1.0060733556747437,
      "learning_rate": 4.9923876553911334e-05,
      "loss": 1.2098,
      "step": 17300
    },
    {
      "epoch": 0.0348,
      "grad_norm": 1.3275829553604126,
      "learning_rate": 4.992263428266958e-05,
      "loss": 1.2256,
      "step": 17400
    },
    {
      "epoch": 0.035,
      "grad_norm": 1.3165931701660156,
      "learning_rate": 4.992138197259373e-05,
      "loss": 1.2276,
      "step": 17500
    },
    {
      "epoch": 0.0352,
      "grad_norm": 1.2749327421188354,
      "learning_rate": 4.9920119624188196e-05,
      "loss": 1.2758,
      "step": 17600
    },
    {
      "epoch": 0.0354,
      "grad_norm": 1.0836033821105957,
      "learning_rate": 4.991884723796146e-05,
      "loss": 1.2407,
      "step": 17700
    },
    {
      "epoch": 0.0356,
      "grad_norm": 1.343475103378296,
      "learning_rate": 4.9917564814426034e-05,
      "loss": 1.2466,
      "step": 17800
    },
    {
      "epoch": 0.0358,
      "grad_norm": 1.3868790864944458,
      "learning_rate": 4.991627235409848e-05,
      "loss": 1.2402,
      "step": 17900
    },
    {
      "epoch": 0.036,
      "grad_norm": 1.5200074911117554,
      "learning_rate": 4.99149698574994e-05,
      "loss": 1.2183,
      "step": 18000
    },
    {
      "epoch": 0.036,
      "eval_loss": 1.0960842370986938,
      "eval_runtime": 76.481,
      "eval_samples_per_second": 201.92,
      "eval_steps_per_second": 3.164,
      "step": 18000
    },
    {
      "epoch": 0.0362,
      "grad_norm": 1.4647791385650635,
      "learning_rate": 4.991365732515345e-05,
      "loss": 1.2386,
      "step": 18100
    },
    {
      "epoch": 0.0364,
      "grad_norm": 0.9076351523399353,
      "learning_rate": 4.991233475758931e-05,
      "loss": 1.2011,
      "step": 18200
    },
    {
      "epoch": 0.0366,
      "grad_norm": 0.9813222289085388,
      "learning_rate": 4.99110021553397e-05,
      "loss": 1.214,
      "step": 18300
    },
    {
      "epoch": 0.0368,
      "grad_norm": 1.5431565046310425,
      "learning_rate": 4.99096595189414e-05,
      "loss": 1.2206,
      "step": 18400
    },
    {
      "epoch": 0.037,
      "grad_norm": 0.9991932511329651,
      "learning_rate": 4.990830684893523e-05,
      "loss": 1.2334,
      "step": 18500
    },
    {
      "epoch": 0.0372,
      "grad_norm": 0.6322658658027649,
      "learning_rate": 4.9906944145866035e-05,
      "loss": 1.2354,
      "step": 18600
    },
    {
      "epoch": 0.0374,
      "grad_norm": 0.9555477499961853,
      "learning_rate": 4.990557141028272e-05,
      "loss": 1.2017,
      "step": 18700
    },
    {
      "epoch": 0.0376,
      "grad_norm": 1.171019196510315,
      "learning_rate": 4.990418864273822e-05,
      "loss": 1.286,
      "step": 18800
    },
    {
      "epoch": 0.0378,
      "grad_norm": 1.2275811433792114,
      "learning_rate": 4.990279584378951e-05,
      "loss": 1.2345,
      "step": 18900
    },
    {
      "epoch": 0.038,
      "grad_norm": 1.6589407920837402,
      "learning_rate": 4.9901393013997616e-05,
      "loss": 1.2376,
      "step": 19000
    },
    {
      "epoch": 0.038,
      "eval_loss": 1.107132077217102,
      "eval_runtime": 76.3932,
      "eval_samples_per_second": 202.152,
      "eval_steps_per_second": 3.168,
      "step": 19000
    },
    {
      "epoch": 0.0382,
      "grad_norm": 0.7907335758209229,
      "learning_rate": 4.9899980153927596e-05,
      "loss": 1.2554,
      "step": 19100
    },
    {
      "epoch": 0.0384,
      "grad_norm": 1.4444235563278198,
      "learning_rate": 4.989855726414854e-05,
      "loss": 1.2618,
      "step": 19200
    },
    {
      "epoch": 0.0386,
      "grad_norm": 1.1591296195983887,
      "learning_rate": 4.98971243452336e-05,
      "loss": 1.2028,
      "step": 19300
    },
    {
      "epoch": 0.0388,
      "grad_norm": 0.9183579087257385,
      "learning_rate": 4.989568139775995e-05,
      "loss": 1.2259,
      "step": 19400
    },
    {
      "epoch": 0.039,
      "grad_norm": 1.0866785049438477,
      "learning_rate": 4.9894228422308805e-05,
      "loss": 1.2307,
      "step": 19500
    },
    {
      "epoch": 0.0392,
      "grad_norm": 1.5889687538146973,
      "learning_rate": 4.9892765419465436e-05,
      "loss": 1.2346,
      "step": 19600
    },
    {
      "epoch": 0.0394,
      "grad_norm": 1.300850510597229,
      "learning_rate": 4.989129238981913e-05,
      "loss": 1.2748,
      "step": 19700
    },
    {
      "epoch": 0.0396,
      "grad_norm": 1.2363704442977905,
      "learning_rate": 4.988980933396323e-05,
      "loss": 1.2536,
      "step": 19800
    },
    {
      "epoch": 0.0398,
      "grad_norm": 0.8141745328903198,
      "learning_rate": 4.9888316252495106e-05,
      "loss": 1.2198,
      "step": 19900
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0759721994400024,
      "learning_rate": 4.988681314601617e-05,
      "loss": 1.2225,
      "step": 20000
    },
    {
      "epoch": 0.04,
      "eval_loss": 1.0962127447128296,
      "eval_runtime": 76.7102,
      "eval_samples_per_second": 201.316,
      "eval_steps_per_second": 3.155,
      "step": 20000
    },
    {
      "epoch": 0.0402,
      "grad_norm": 1.2748645544052124,
      "learning_rate": 4.988530001513187e-05,
      "loss": 1.2245,
      "step": 20100
    },
    {
      "epoch": 0.0404,
      "grad_norm": 1.3192243576049805,
      "learning_rate": 4.9883776860451704e-05,
      "loss": 1.2292,
      "step": 20200
    },
    {
      "epoch": 0.0406,
      "grad_norm": 1.329868197441101,
      "learning_rate": 4.98822436825892e-05,
      "loss": 1.2243,
      "step": 20300
    },
    {
      "epoch": 0.0408,
      "grad_norm": 1.3394356966018677,
      "learning_rate": 4.988070048216191e-05,
      "loss": 1.216,
      "step": 20400
    },
    {
      "epoch": 0.041,
      "grad_norm": 1.3035671710968018,
      "learning_rate": 4.987914725979144e-05,
      "loss": 1.2335,
      "step": 20500
    },
    {
      "epoch": 0.0412,
      "grad_norm": 1.2765480279922485,
      "learning_rate": 4.987758401610343e-05,
      "loss": 1.261,
      "step": 20600
    },
    {
      "epoch": 0.0414,
      "grad_norm": 1.0472270250320435,
      "learning_rate": 4.9876010751727553e-05,
      "loss": 1.2173,
      "step": 20700
    },
    {
      "epoch": 0.0416,
      "grad_norm": 1.163237452507019,
      "learning_rate": 4.9874427467297525e-05,
      "loss": 1.2316,
      "step": 20800
    },
    {
      "epoch": 0.0418,
      "grad_norm": 1.3546457290649414,
      "learning_rate": 4.987283416345109e-05,
      "loss": 1.2268,
      "step": 20900
    },
    {
      "epoch": 0.042,
      "grad_norm": 1.0812748670578003,
      "learning_rate": 4.9871230840830016e-05,
      "loss": 1.2267,
      "step": 21000
    },
    {
      "epoch": 0.042,
      "eval_loss": 1.1046785116195679,
      "eval_runtime": 76.3631,
      "eval_samples_per_second": 202.231,
      "eval_steps_per_second": 3.169,
      "step": 21000
    },
    {
      "epoch": 0.0422,
      "grad_norm": 0.7458230257034302,
      "learning_rate": 4.986961750008014e-05,
      "loss": 1.1918,
      "step": 21100
    },
    {
      "epoch": 0.0424,
      "grad_norm": 1.2837951183319092,
      "learning_rate": 4.986799414185131e-05,
      "loss": 1.2206,
      "step": 21200
    },
    {
      "epoch": 0.0426,
      "grad_norm": 1.4213489294052124,
      "learning_rate": 4.986636076679742e-05,
      "loss": 1.2552,
      "step": 21300
    },
    {
      "epoch": 0.0428,
      "grad_norm": 1.297608733177185,
      "learning_rate": 4.986471737557638e-05,
      "loss": 1.2234,
      "step": 21400
    },
    {
      "epoch": 0.043,
      "grad_norm": 1.3617885112762451,
      "learning_rate": 4.986306396885015e-05,
      "loss": 1.2381,
      "step": 21500
    },
    {
      "epoch": 0.0432,
      "grad_norm": 1.500025749206543,
      "learning_rate": 4.986140054728473e-05,
      "loss": 1.1957,
      "step": 21600
    },
    {
      "epoch": 0.0434,
      "grad_norm": 0.6222732663154602,
      "learning_rate": 4.9859727111550147e-05,
      "loss": 1.2579,
      "step": 21700
    },
    {
      "epoch": 0.0436,
      "grad_norm": 1.4154349565505981,
      "learning_rate": 4.985804366232045e-05,
      "loss": 1.2073,
      "step": 21800
    },
    {
      "epoch": 0.0438,
      "grad_norm": 1.334390640258789,
      "learning_rate": 4.9856350200273746e-05,
      "loss": 1.2317,
      "step": 21900
    },
    {
      "epoch": 0.044,
      "grad_norm": 0.8164774179458618,
      "learning_rate": 4.985464672609215e-05,
      "loss": 1.2248,
      "step": 22000
    },
    {
      "epoch": 0.044,
      "eval_loss": 1.1025385856628418,
      "eval_runtime": 76.8498,
      "eval_samples_per_second": 200.951,
      "eval_steps_per_second": 3.149,
      "step": 22000
    },
    {
      "epoch": 0.0442,
      "grad_norm": 1.1641725301742554,
      "learning_rate": 4.985293324046182e-05,
      "loss": 1.1928,
      "step": 22100
    },
    {
      "epoch": 0.0444,
      "grad_norm": 1.2185006141662598,
      "learning_rate": 4.9851209744072954e-05,
      "loss": 1.2435,
      "step": 22200
    },
    {
      "epoch": 0.0446,
      "grad_norm": 1.0973742008209229,
      "learning_rate": 4.9849476237619784e-05,
      "loss": 1.2515,
      "step": 22300
    },
    {
      "epoch": 0.0448,
      "grad_norm": 1.0242998600006104,
      "learning_rate": 4.984773272180056e-05,
      "loss": 1.2511,
      "step": 22400
    },
    {
      "epoch": 0.045,
      "grad_norm": 0.598416805267334,
      "learning_rate": 4.984597919731755e-05,
      "loss": 1.215,
      "step": 22500
    },
    {
      "epoch": 0.0452,
      "grad_norm": 0.9391146302223206,
      "learning_rate": 4.98442156648771e-05,
      "loss": 1.2303,
      "step": 22600
    },
    {
      "epoch": 0.0454,
      "grad_norm": 0.9301611185073853,
      "learning_rate": 4.9842442125189556e-05,
      "loss": 1.2621,
      "step": 22700
    },
    {
      "epoch": 0.0456,
      "grad_norm": 1.3423951864242554,
      "learning_rate": 4.984065857896928e-05,
      "loss": 1.2251,
      "step": 22800
    },
    {
      "epoch": 0.0458,
      "grad_norm": 1.3373651504516602,
      "learning_rate": 4.983886502693471e-05,
      "loss": 1.2738,
      "step": 22900
    },
    {
      "epoch": 0.046,
      "grad_norm": 1.007158637046814,
      "learning_rate": 4.983706146980828e-05,
      "loss": 1.1923,
      "step": 23000
    },
    {
      "epoch": 0.046,
      "eval_loss": 1.1094993352890015,
      "eval_runtime": 76.6473,
      "eval_samples_per_second": 201.481,
      "eval_steps_per_second": 3.157,
      "step": 23000
    },
    {
      "epoch": 0.0462,
      "grad_norm": 0.7804542779922485,
      "learning_rate": 4.9835247908316454e-05,
      "loss": 1.2098,
      "step": 23100
    },
    {
      "epoch": 0.0464,
      "grad_norm": 1.377008318901062,
      "learning_rate": 4.983342434318975e-05,
      "loss": 1.2202,
      "step": 23200
    },
    {
      "epoch": 0.0466,
      "grad_norm": 1.1037031412124634,
      "learning_rate": 4.983159077516268e-05,
      "loss": 1.1977,
      "step": 23300
    },
    {
      "epoch": 0.0468,
      "grad_norm": 0.7141278386116028,
      "learning_rate": 4.982974720497382e-05,
      "loss": 1.2054,
      "step": 23400
    },
    {
      "epoch": 0.047,
      "grad_norm": 0.570811927318573,
      "learning_rate": 4.9827893633365754e-05,
      "loss": 1.2163,
      "step": 23500
    },
    {
      "epoch": 0.0472,
      "grad_norm": 0.7255613803863525,
      "learning_rate": 4.98260300610851e-05,
      "loss": 1.2212,
      "step": 23600
    },
    {
      "epoch": 0.0474,
      "grad_norm": 0.8988520503044128,
      "learning_rate": 4.982415648888251e-05,
      "loss": 1.2332,
      "step": 23700
    },
    {
      "epoch": 0.0476,
      "grad_norm": 1.2191438674926758,
      "learning_rate": 4.9822272917512644e-05,
      "loss": 1.1974,
      "step": 23800
    },
    {
      "epoch": 0.0478,
      "grad_norm": 1.2043516635894775,
      "learning_rate": 4.982037934773423e-05,
      "loss": 1.2229,
      "step": 23900
    },
    {
      "epoch": 0.048,
      "grad_norm": 1.3503689765930176,
      "learning_rate": 4.981847578030998e-05,
      "loss": 1.2307,
      "step": 24000
    },
    {
      "epoch": 0.048,
      "eval_loss": 1.0969973802566528,
      "eval_runtime": 76.7433,
      "eval_samples_per_second": 201.229,
      "eval_steps_per_second": 3.153,
      "step": 24000
    },
    {
      "epoch": 0.0482,
      "grad_norm": 1.3795185089111328,
      "learning_rate": 4.9816562216006645e-05,
      "loss": 1.1894,
      "step": 24100
    },
    {
      "epoch": 0.0484,
      "grad_norm": 1.1966140270233154,
      "learning_rate": 4.9814638655595024e-05,
      "loss": 1.2011,
      "step": 24200
    },
    {
      "epoch": 0.0486,
      "grad_norm": 1.179077386856079,
      "learning_rate": 4.981270509984992e-05,
      "loss": 1.2596,
      "step": 24300
    },
    {
      "epoch": 0.0488,
      "grad_norm": 1.24593186378479,
      "learning_rate": 4.9810761549550166e-05,
      "loss": 1.2219,
      "step": 24400
    },
    {
      "epoch": 0.049,
      "grad_norm": 1.2809820175170898,
      "learning_rate": 4.9808808005478635e-05,
      "loss": 1.2033,
      "step": 24500
    },
    {
      "epoch": 0.0492,
      "grad_norm": 0.9016757011413574,
      "learning_rate": 4.9806844468422196e-05,
      "loss": 1.2394,
      "step": 24600
    },
    {
      "epoch": 0.0494,
      "grad_norm": 0.7064381837844849,
      "learning_rate": 4.9804870939171774e-05,
      "loss": 1.2154,
      "step": 24700
    },
    {
      "epoch": 0.0496,
      "grad_norm": 0.626646101474762,
      "learning_rate": 4.980288741852231e-05,
      "loss": 1.2021,
      "step": 24800
    },
    {
      "epoch": 0.0498,
      "grad_norm": 1.049187421798706,
      "learning_rate": 4.980089390727275e-05,
      "loss": 1.1839,
      "step": 24900
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.2987581491470337,
      "learning_rate": 4.97988904062261e-05,
      "loss": 1.1969,
      "step": 25000
    },
    {
      "epoch": 0.05,
      "eval_loss": 1.090114951133728,
      "eval_runtime": 77.5992,
      "eval_samples_per_second": 199.01,
      "eval_steps_per_second": 3.119,
      "step": 25000
    },
    {
      "epoch": 0.0502,
      "grad_norm": 1.105361819267273,
      "learning_rate": 4.979687691618936e-05,
      "loss": 1.1784,
      "step": 25100
    },
    {
      "epoch": 0.0504,
      "grad_norm": 0.7138956189155579,
      "learning_rate": 4.9794853437973555e-05,
      "loss": 1.2016,
      "step": 25200
    },
    {
      "epoch": 0.0506,
      "grad_norm": 1.250241756439209,
      "learning_rate": 4.9792819972393756e-05,
      "loss": 1.2032,
      "step": 25300
    },
    {
      "epoch": 0.0508,
      "grad_norm": 0.5875529050827026,
      "learning_rate": 4.9790776520269034e-05,
      "loss": 1.2034,
      "step": 25400
    },
    {
      "epoch": 0.051,
      "grad_norm": 1.2880475521087646,
      "learning_rate": 4.9788723082422495e-05,
      "loss": 1.2172,
      "step": 25500
    },
    {
      "epoch": 0.0512,
      "grad_norm": 0.8775302767753601,
      "learning_rate": 4.978665965968127e-05,
      "loss": 1.2264,
      "step": 25600
    },
    {
      "epoch": 0.0514,
      "grad_norm": 0.7336851954460144,
      "learning_rate": 4.978458625287649e-05,
      "loss": 1.2248,
      "step": 25700
    },
    {
      "epoch": 0.0516,
      "grad_norm": 1.431084156036377,
      "learning_rate": 4.978250286284333e-05,
      "loss": 1.2353,
      "step": 25800
    },
    {
      "epoch": 0.0518,
      "grad_norm": 1.6342276334762573,
      "learning_rate": 4.978040949042099e-05,
      "loss": 1.1984,
      "step": 25900
    },
    {
      "epoch": 0.052,
      "grad_norm": 1.5883526802062988,
      "learning_rate": 4.977830613645266e-05,
      "loss": 1.2251,
      "step": 26000
    },
    {
      "epoch": 0.052,
      "eval_loss": 1.0901614427566528,
      "eval_runtime": 76.7254,
      "eval_samples_per_second": 201.276,
      "eval_steps_per_second": 3.154,
      "step": 26000
    },
    {
      "epoch": 0.0522,
      "grad_norm": 1.1527795791625977,
      "learning_rate": 4.977619280178558e-05,
      "loss": 1.2043,
      "step": 26100
    },
    {
      "epoch": 0.0524,
      "grad_norm": 1.5160431861877441,
      "learning_rate": 4.9774069487271014e-05,
      "loss": 1.1931,
      "step": 26200
    },
    {
      "epoch": 0.0526,
      "grad_norm": 1.2551748752593994,
      "learning_rate": 4.977193619376421e-05,
      "loss": 1.2397,
      "step": 26300
    },
    {
      "epoch": 0.0528,
      "grad_norm": 1.2745076417922974,
      "learning_rate": 4.976979292212448e-05,
      "loss": 1.2336,
      "step": 26400
    },
    {
      "epoch": 0.053,
      "grad_norm": 1.4893673658370972,
      "learning_rate": 4.976763967321511e-05,
      "loss": 1.1827,
      "step": 26500
    },
    {
      "epoch": 0.0532,
      "grad_norm": 0.857379138469696,
      "learning_rate": 4.976547644790346e-05,
      "loss": 1.2441,
      "step": 26600
    },
    {
      "epoch": 0.0534,
      "grad_norm": 1.167006492614746,
      "learning_rate": 4.976330324706084e-05,
      "loss": 1.2779,
      "step": 26700
    },
    {
      "epoch": 0.0536,
      "grad_norm": 0.634842574596405,
      "learning_rate": 4.976112007156265e-05,
      "loss": 1.2828,
      "step": 26800
    },
    {
      "epoch": 0.0538,
      "grad_norm": 0.9239290952682495,
      "learning_rate": 4.975892692228825e-05,
      "loss": 1.2094,
      "step": 26900
    },
    {
      "epoch": 0.054,
      "grad_norm": 1.2031028270721436,
      "learning_rate": 4.9756723800121044e-05,
      "loss": 1.222,
      "step": 27000
    },
    {
      "epoch": 0.054,
      "eval_loss": 1.0867078304290771,
      "eval_runtime": 76.6606,
      "eval_samples_per_second": 201.446,
      "eval_steps_per_second": 3.157,
      "step": 27000
    },
    {
      "epoch": 0.0542,
      "grad_norm": 1.3575947284698486,
      "learning_rate": 4.9754510705948456e-05,
      "loss": 1.1622,
      "step": 27100
    },
    {
      "epoch": 0.0544,
      "grad_norm": 1.142074465751648,
      "learning_rate": 4.975228764066191e-05,
      "loss": 1.2703,
      "step": 27200
    },
    {
      "epoch": 0.0546,
      "grad_norm": 0.8273721933364868,
      "learning_rate": 4.975005460515686e-05,
      "loss": 1.1921,
      "step": 27300
    },
    {
      "epoch": 0.0548,
      "grad_norm": 1.3859556913375854,
      "learning_rate": 4.974781160033278e-05,
      "loss": 1.2195,
      "step": 27400
    },
    {
      "epoch": 0.055,
      "grad_norm": 1.2232416868209839,
      "learning_rate": 4.974555862709315e-05,
      "loss": 1.1851,
      "step": 27500
    },
    {
      "epoch": 0.0552,
      "grad_norm": 0.7069573998451233,
      "learning_rate": 4.974329568634546e-05,
      "loss": 1.2098,
      "step": 27600
    },
    {
      "epoch": 0.0554,
      "grad_norm": 1.2497153282165527,
      "learning_rate": 4.974102277900122e-05,
      "loss": 1.206,
      "step": 27700
    },
    {
      "epoch": 0.0556,
      "grad_norm": 1.206449031829834,
      "learning_rate": 4.9738739905975976e-05,
      "loss": 1.2352,
      "step": 27800
    },
    {
      "epoch": 0.0558,
      "grad_norm": 1.3927749395370483,
      "learning_rate": 4.973644706818925e-05,
      "loss": 1.1952,
      "step": 27900
    },
    {
      "epoch": 0.056,
      "grad_norm": 1.3856321573257446,
      "learning_rate": 4.973414426656461e-05,
      "loss": 1.2499,
      "step": 28000
    },
    {
      "epoch": 0.056,
      "eval_loss": 1.0941141843795776,
      "eval_runtime": 76.7063,
      "eval_samples_per_second": 201.326,
      "eval_steps_per_second": 3.155,
      "step": 28000
    },
    {
      "epoch": 0.0562,
      "grad_norm": 0.6676329970359802,
      "learning_rate": 4.9731831502029606e-05,
      "loss": 1.2333,
      "step": 28100
    },
    {
      "epoch": 0.0564,
      "grad_norm": 1.2670732736587524,
      "learning_rate": 4.972950877551584e-05,
      "loss": 1.183,
      "step": 28200
    },
    {
      "epoch": 0.0566,
      "grad_norm": 1.2089595794677734,
      "learning_rate": 4.972717608795889e-05,
      "loss": 1.2445,
      "step": 28300
    },
    {
      "epoch": 0.0568,
      "grad_norm": 1.1897366046905518,
      "learning_rate": 4.972483344029838e-05,
      "loss": 1.2217,
      "step": 28400
    },
    {
      "epoch": 0.057,
      "grad_norm": 1.4963501691818237,
      "learning_rate": 4.97224808334779e-05,
      "loss": 1.2079,
      "step": 28500
    },
    {
      "epoch": 0.0572,
      "grad_norm": 1.594019889831543,
      "learning_rate": 4.972011826844511e-05,
      "loss": 1.1822,
      "step": 28600
    },
    {
      "epoch": 0.0574,
      "grad_norm": 1.3324779272079468,
      "learning_rate": 4.971774574615163e-05,
      "loss": 1.2562,
      "step": 28700
    },
    {
      "epoch": 0.0576,
      "grad_norm": 1.3334344625473022,
      "learning_rate": 4.971536326755313e-05,
      "loss": 1.2509,
      "step": 28800
    },
    {
      "epoch": 0.0578,
      "grad_norm": 0.9475389719009399,
      "learning_rate": 4.971297083360925e-05,
      "loss": 1.1826,
      "step": 28900
    },
    {
      "epoch": 0.058,
      "grad_norm": 0.8067657947540283,
      "learning_rate": 4.971056844528368e-05,
      "loss": 1.1895,
      "step": 29000
    },
    {
      "epoch": 0.058,
      "eval_loss": 1.0870901346206665,
      "eval_runtime": 76.6141,
      "eval_samples_per_second": 201.569,
      "eval_steps_per_second": 3.159,
      "step": 29000
    },
    {
      "epoch": 0.0582,
      "grad_norm": 0.7364763617515564,
      "learning_rate": 4.970815610354409e-05,
      "loss": 1.1821,
      "step": 29100
    },
    {
      "epoch": 0.0584,
      "grad_norm": 1.494878888130188,
      "learning_rate": 4.970573380936218e-05,
      "loss": 1.1592,
      "step": 29200
    },
    {
      "epoch": 0.0586,
      "grad_norm": 0.7247675061225891,
      "learning_rate": 4.9703301563713645e-05,
      "loss": 1.2347,
      "step": 29300
    },
    {
      "epoch": 0.0588,
      "grad_norm": 1.0013625621795654,
      "learning_rate": 4.970085936757819e-05,
      "loss": 1.2536,
      "step": 29400
    },
    {
      "epoch": 0.059,
      "grad_norm": 1.012537956237793,
      "learning_rate": 4.969840722193955e-05,
      "loss": 1.2461,
      "step": 29500
    },
    {
      "epoch": 0.0592,
      "grad_norm": 0.8702846169471741,
      "learning_rate": 4.969594512778541e-05,
      "loss": 1.2005,
      "step": 29600
    },
    {
      "epoch": 0.0594,
      "grad_norm": 1.1068499088287354,
      "learning_rate": 4.969347308610755e-05,
      "loss": 1.1942,
      "step": 29700
    },
    {
      "epoch": 0.0596,
      "grad_norm": 1.6333682537078857,
      "learning_rate": 4.969099109790167e-05,
      "loss": 1.2372,
      "step": 29800
    },
    {
      "epoch": 0.0598,
      "grad_norm": 1.0337685346603394,
      "learning_rate": 4.9688499164167536e-05,
      "loss": 1.2435,
      "step": 29900
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8429011702537537,
      "learning_rate": 4.9685997285908894e-05,
      "loss": 1.2023,
      "step": 30000
    },
    {
      "epoch": 0.06,
      "eval_loss": 1.086748480796814,
      "eval_runtime": 76.8684,
      "eval_samples_per_second": 200.902,
      "eval_steps_per_second": 3.148,
      "step": 30000
    },
    {
      "epoch": 0.0002,
      "grad_norm": 0.8381020426750183,
      "learning_rate": 4.9683485464133484e-05,
      "loss": 1.2362,
      "step": 30100
    },
    {
      "epoch": 0.0004,
      "grad_norm": 0.6860467791557312,
      "learning_rate": 4.968096369985309e-05,
      "loss": 1.2125,
      "step": 30200
    },
    {
      "epoch": 0.0006,
      "grad_norm": 0.9316505193710327,
      "learning_rate": 4.967843199408347e-05,
      "loss": 1.1904,
      "step": 30300
    },
    {
      "epoch": 0.0008,
      "grad_norm": 1.3389461040496826,
      "learning_rate": 4.967589034784439e-05,
      "loss": 1.2689,
      "step": 30400
    },
    {
      "epoch": 0.001,
      "grad_norm": 0.9387079477310181,
      "learning_rate": 4.967333876215963e-05,
      "loss": 1.2205,
      "step": 30500
    },
    {
      "epoch": 0.0012,
      "grad_norm": 0.7549923062324524,
      "learning_rate": 4.967077723805697e-05,
      "loss": 1.21,
      "step": 30600
    },
    {
      "epoch": 0.0014,
      "grad_norm": 1.1242858171463013,
      "learning_rate": 4.966820577656819e-05,
      "loss": 1.203,
      "step": 30700
    },
    {
      "epoch": 0.0016,
      "grad_norm": 1.5065937042236328,
      "learning_rate": 4.966562437872907e-05,
      "loss": 1.2233,
      "step": 30800
    },
    {
      "epoch": 0.0018,
      "grad_norm": 1.1448508501052856,
      "learning_rate": 4.96630330455794e-05,
      "loss": 1.2242,
      "step": 30900
    },
    {
      "epoch": 0.002,
      "grad_norm": 0.7356053590774536,
      "learning_rate": 4.966043177816296e-05,
      "loss": 1.2541,
      "step": 31000
    },
    {
      "epoch": 0.002,
      "eval_loss": 1.0892270803451538,
      "eval_runtime": 78.1396,
      "eval_samples_per_second": 197.633,
      "eval_steps_per_second": 3.097,
      "step": 31000
    },
    {
      "epoch": 0.0022,
      "grad_norm": 1.290472149848938,
      "learning_rate": 4.965782057752757e-05,
      "loss": 1.2005,
      "step": 31100
    },
    {
      "epoch": 0.0024,
      "grad_norm": 0.7970076203346252,
      "learning_rate": 4.965519944472498e-05,
      "loss": 1.2718,
      "step": 31200
    },
    {
      "epoch": 0.0026,
      "grad_norm": 1.3415039777755737,
      "learning_rate": 4.9652568380811016e-05,
      "loss": 1.2673,
      "step": 31300
    },
    {
      "epoch": 0.0028,
      "grad_norm": 1.3146836757659912,
      "learning_rate": 4.9649927386845444e-05,
      "loss": 1.2717,
      "step": 31400
    },
    {
      "epoch": 0.003,
      "grad_norm": 0.9725894927978516,
      "learning_rate": 4.964727646389208e-05,
      "loss": 1.2418,
      "step": 31500
    },
    {
      "epoch": 0.0032,
      "grad_norm": 0.9590099453926086,
      "learning_rate": 4.96446156130187e-05,
      "loss": 1.2389,
      "step": 31600
    },
    {
      "epoch": 0.0034,
      "grad_norm": 1.5478194952011108,
      "learning_rate": 4.964194483529709e-05,
      "loss": 1.2693,
      "step": 31700
    },
    {
      "epoch": 0.0036,
      "grad_norm": 0.7029865384101868,
      "learning_rate": 4.9639264131803056e-05,
      "loss": 1.25,
      "step": 31800
    },
    {
      "epoch": 0.0038,
      "grad_norm": 0.7784998416900635,
      "learning_rate": 4.963657350361637e-05,
      "loss": 1.2339,
      "step": 31900
    },
    {
      "epoch": 0.004,
      "grad_norm": 0.6479517817497253,
      "learning_rate": 4.963387295182083e-05,
      "loss": 1.2538,
      "step": 32000
    },
    {
      "epoch": 0.004,
      "eval_loss": 1.0948545932769775,
      "eval_runtime": 77.4713,
      "eval_samples_per_second": 199.338,
      "eval_steps_per_second": 3.124,
      "step": 32000
    },
    {
      "epoch": 0.0042,
      "grad_norm": 1.4759093523025513,
      "learning_rate": 4.963116247750421e-05,
      "loss": 1.2646,
      "step": 32100
    },
    {
      "epoch": 0.0044,
      "grad_norm": 0.7561829686164856,
      "learning_rate": 4.9628442081758285e-05,
      "loss": 1.2083,
      "step": 32200
    },
    {
      "epoch": 0.0046,
      "grad_norm": 0.6289774775505066,
      "learning_rate": 4.962571176567884e-05,
      "loss": 1.2492,
      "step": 32300
    },
    {
      "epoch": 0.0048,
      "grad_norm": 0.8146848678588867,
| "learning_rate": 4.962297153036564e-05, | |
| "loss": 1.2693, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 1.1135525703430176, | |
| "learning_rate": 4.962022137692245e-05, | |
| "loss": 1.2218, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.0052, | |
| "grad_norm": 1.1507619619369507, | |
| "learning_rate": 4.961746130645703e-05, | |
| "loss": 1.2118, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.0054, | |
| "grad_norm": 0.8586376905441284, | |
| "learning_rate": 4.961469132008114e-05, | |
| "loss": 1.2115, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.0056, | |
| "grad_norm": 1.5335224866867065, | |
| "learning_rate": 4.961191141891054e-05, | |
| "loss": 1.2239, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.0058, | |
| "grad_norm": 1.2822892665863037, | |
| "learning_rate": 4.960912160406496e-05, | |
| "loss": 1.2443, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 0.9584761261940002, | |
| "learning_rate": 4.960632187666814e-05, | |
| "loss": 1.243, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 1.0964241027832031, | |
| "eval_runtime": 76.2571, | |
| "eval_samples_per_second": 202.512, | |
| "eval_steps_per_second": 3.173, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.0062, | |
| "grad_norm": 0.7512497305870056, | |
| "learning_rate": 4.960351223784781e-05, | |
| "loss": 1.1821, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 1.3305505514144897, | |
| "learning_rate": 4.960069268873568e-05, | |
| "loss": 1.2393, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.0066, | |
| "grad_norm": 1.5360506772994995, | |
| "learning_rate": 4.959786323046749e-05, | |
| "loss": 1.2475, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.0068, | |
| "grad_norm": 0.7005806565284729, | |
| "learning_rate": 4.959502386418293e-05, | |
| "loss": 1.2122, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 1.381052017211914, | |
| "learning_rate": 4.95921745910257e-05, | |
| "loss": 1.2336, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.0072, | |
| "grad_norm": 1.074300765991211, | |
| "learning_rate": 4.958931541214349e-05, | |
| "loss": 1.2661, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.0074, | |
| "grad_norm": 1.1441256999969482, | |
| "learning_rate": 4.9586446328687967e-05, | |
| "loss": 1.2296, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.0076, | |
| "grad_norm": 0.8737586140632629, | |
| "learning_rate": 4.958356734181481e-05, | |
| "loss": 1.2067, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.0078, | |
| "grad_norm": 1.1493791341781616, | |
| "learning_rate": 4.958067845268366e-05, | |
| "loss": 1.2643, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 1.3028621673583984, | |
| "learning_rate": 4.957777966245817e-05, | |
| "loss": 1.2427, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 1.096444845199585, | |
| "eval_runtime": 76.4253, | |
| "eval_samples_per_second": 202.067, | |
| "eval_steps_per_second": 3.166, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.0082, | |
| "grad_norm": 1.31423819065094, | |
| "learning_rate": 4.957487097230597e-05, | |
| "loss": 1.2137, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.0084, | |
| "grad_norm": 1.1846545934677124, | |
| "learning_rate": 4.957195238339868e-05, | |
| "loss": 1.2141, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.0086, | |
| "grad_norm": 0.9421952366828918, | |
| "learning_rate": 4.9569023896911914e-05, | |
| "loss": 1.219, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.0088, | |
| "grad_norm": 1.4107282161712646, | |
| "learning_rate": 4.9566085514025256e-05, | |
| "loss": 1.2141, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 0.7364057302474976, | |
| "learning_rate": 4.95631372359223e-05, | |
| "loss": 1.246, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.0092, | |
| "grad_norm": 0.8100732564926147, | |
| "learning_rate": 4.956017906379059e-05, | |
| "loss": 1.1891, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.0094, | |
| "grad_norm": 1.2455086708068848, | |
| "learning_rate": 4.955721099882169e-05, | |
| "loss": 1.2458, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 0.676437497138977, | |
| "learning_rate": 4.9554233042211146e-05, | |
| "loss": 1.2058, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.0098, | |
| "grad_norm": 1.3339647054672241, | |
| "learning_rate": 4.955124519515847e-05, | |
| "loss": 1.2407, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.9411395192146301, | |
| "learning_rate": 4.954824745886716e-05, | |
| "loss": 1.1974, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 1.0945005416870117, | |
| "eval_runtime": 76.9422, | |
| "eval_samples_per_second": 200.709, | |
| "eval_steps_per_second": 3.145, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.0102, | |
| "grad_norm": 0.6638602018356323, | |
| "learning_rate": 4.95452398345447e-05, | |
| "loss": 1.2259, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.0104, | |
| "grad_norm": 0.6337453722953796, | |
| "learning_rate": 4.954222232340259e-05, | |
| "loss": 1.1686, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.0106, | |
| "grad_norm": 0.809762179851532, | |
| "learning_rate": 4.953919492665625e-05, | |
| "loss": 1.2174, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.0108, | |
| "grad_norm": 0.9431924819946289, | |
| "learning_rate": 4.953615764552513e-05, | |
| "loss": 1.2128, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 0.7606577277183533, | |
| "learning_rate": 4.953311048123265e-05, | |
| "loss": 1.2473, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.0112, | |
| "grad_norm": 1.1843669414520264, | |
| "learning_rate": 4.953005343500619e-05, | |
| "loss": 1.2194, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.0114, | |
| "grad_norm": 0.9086577296257019, | |
| "learning_rate": 4.952698650807715e-05, | |
| "loss": 1.2572, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.0116, | |
| "grad_norm": 1.36215078830719, | |
| "learning_rate": 4.9523909701680874e-05, | |
| "loss": 1.2263, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.0118, | |
| "grad_norm": 0.8537183403968811, | |
| "learning_rate": 4.952082301705671e-05, | |
| "loss": 1.2297, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 0.6182298064231873, | |
| "learning_rate": 4.9517726455447955e-05, | |
| "loss": 1.2101, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 1.0894391536712646, | |
| "eval_runtime": 76.3033, | |
| "eval_samples_per_second": 202.39, | |
| "eval_steps_per_second": 3.172, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.0122, | |
| "grad_norm": 1.1102640628814697, | |
| "learning_rate": 4.951462001810192e-05, | |
| "loss": 1.2086, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.0124, | |
| "grad_norm": 0.9391844868659973, | |
| "learning_rate": 4.951150370626988e-05, | |
| "loss": 1.2595, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.0126, | |
| "grad_norm": 1.3386393785476685, | |
| "learning_rate": 4.950837752120707e-05, | |
| "loss": 1.1953, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 1.0943065881729126, | |
| "learning_rate": 4.950524146417273e-05, | |
| "loss": 1.2759, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.9743318557739258, | |
| "learning_rate": 4.950209553643006e-05, | |
| "loss": 1.2421, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.0132, | |
| "grad_norm": 1.2555447816848755, | |
| "learning_rate": 4.949893973924623e-05, | |
| "loss": 1.242, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.0134, | |
| "grad_norm": 1.3289902210235596, | |
| "learning_rate": 4.949577407389241e-05, | |
| "loss": 1.2337, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.0136, | |
| "grad_norm": 0.8806101679801941, | |
| "learning_rate": 4.949259854164372e-05, | |
| "loss": 1.244, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.0138, | |
| "grad_norm": 1.211584448814392, | |
| "learning_rate": 4.948941314377927e-05, | |
| "loss": 1.2344, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 1.6472032070159912, | |
| "learning_rate": 4.9486217881582134e-05, | |
| "loss": 1.1866, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 1.0940065383911133, | |
| "eval_runtime": 76.3383, | |
| "eval_samples_per_second": 202.297, | |
| "eval_steps_per_second": 3.17, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.0142, | |
| "grad_norm": 1.5010918378829956, | |
| "learning_rate": 4.948301275633936e-05, | |
| "loss": 1.2057, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.0144, | |
| "grad_norm": 0.6793572306632996, | |
| "learning_rate": 4.947979776934197e-05, | |
| "loss": 1.2104, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.0146, | |
| "grad_norm": 0.7654362916946411, | |
| "learning_rate": 4.947657292188498e-05, | |
| "loss": 1.2266, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.0148, | |
| "grad_norm": 1.0618220567703247, | |
| "learning_rate": 4.947333821526734e-05, | |
| "loss": 1.2509, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 1.2712790966033936, | |
| "learning_rate": 4.947009365079199e-05, | |
| "loss": 1.2179, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.0152, | |
| "grad_norm": 1.3342602252960205, | |
| "learning_rate": 4.946683922976584e-05, | |
| "loss": 1.2224, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.0154, | |
| "grad_norm": 0.8218332529067993, | |
| "learning_rate": 4.946357495349978e-05, | |
| "loss": 1.2402, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.0156, | |
| "grad_norm": 2.0291969776153564, | |
| "learning_rate": 4.946030082330865e-05, | |
| "loss": 1.1599, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.0158, | |
| "grad_norm": 1.5702838897705078, | |
| "learning_rate": 4.945701684051128e-05, | |
| "loss": 1.1784, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 1.268508791923523, | |
| "learning_rate": 4.9453723006430444e-05, | |
| "loss": 1.2172, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 1.088572382926941, | |
| "eval_runtime": 76.533, | |
| "eval_samples_per_second": 201.782, | |
| "eval_steps_per_second": 3.162, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.0162, | |
| "grad_norm": 1.3127037286758423, | |
| "learning_rate": 4.945041932239292e-05, | |
| "loss": 1.2299, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.0164, | |
| "grad_norm": 0.7277888655662537, | |
| "learning_rate": 4.9447105789729396e-05, | |
| "loss": 1.2655, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.0166, | |
| "grad_norm": 1.031909704208374, | |
| "learning_rate": 4.94437824097746e-05, | |
| "loss": 1.2179, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.0168, | |
| "grad_norm": 1.2462060451507568, | |
| "learning_rate": 4.9440449183867166e-05, | |
| "loss": 1.2311, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.5426816344261169, | |
| "learning_rate": 4.9437106113349716e-05, | |
| "loss": 1.1637, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.0172, | |
| "grad_norm": 1.2320595979690552, | |
| "learning_rate": 4.9433753199568856e-05, | |
| "loss": 1.2282, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.0174, | |
| "grad_norm": 0.928945779800415, | |
| "learning_rate": 4.943039044387513e-05, | |
| "loss": 1.1936, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.0176, | |
| "grad_norm": 1.4080160856246948, | |
| "learning_rate": 4.9427017847623044e-05, | |
| "loss": 1.251, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.0178, | |
| "grad_norm": 1.3436859846115112, | |
| "learning_rate": 4.9423635412171106e-05, | |
| "loss": 1.287, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 0.9334709048271179, | |
| "learning_rate": 4.9420243138881734e-05, | |
| "loss": 1.1766, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 1.092005968093872, | |
| "eval_runtime": 76.2687, | |
| "eval_samples_per_second": 202.482, | |
| "eval_steps_per_second": 3.173, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.0182, | |
| "grad_norm": 0.9674895405769348, | |
| "learning_rate": 4.9416841029121355e-05, | |
| "loss": 1.2388, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.0184, | |
| "grad_norm": 1.3673955202102661, | |
| "learning_rate": 4.941342908426032e-05, | |
| "loss": 1.183, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.0186, | |
| "grad_norm": 0.8423133492469788, | |
| "learning_rate": 4.941000730567297e-05, | |
| "loss": 1.1847, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.0188, | |
| "grad_norm": 0.9814749360084534, | |
| "learning_rate": 4.94065756947376e-05, | |
| "loss": 1.2022, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 1.125647783279419, | |
| "learning_rate": 4.9403134252836456e-05, | |
| "loss": 1.1966, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 0.8501796722412109, | |
| "learning_rate": 4.9399682981355755e-05, | |
| "loss": 1.2347, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.0194, | |
| "grad_norm": 0.8226144909858704, | |
| "learning_rate": 4.9396221881685665e-05, | |
| "loss": 1.2129, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.0196, | |
| "grad_norm": 0.9265516400337219, | |
| "learning_rate": 4.939275095522032e-05, | |
| "loss": 1.1917, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.0198, | |
| "grad_norm": 0.8538194298744202, | |
| "learning_rate": 4.938927020335781e-05, | |
| "loss": 1.2548, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.2129065990447998, | |
| "learning_rate": 4.9385779627500174e-05, | |
| "loss": 1.2219, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 1.087021827697754, | |
| "eval_runtime": 76.3535, | |
| "eval_samples_per_second": 202.257, | |
| "eval_steps_per_second": 3.169, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.0202, | |
| "grad_norm": 1.2157970666885376, | |
| "learning_rate": 4.938227922905342e-05, | |
| "loss": 1.1623, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.0204, | |
| "grad_norm": 0.6873258948326111, | |
| "learning_rate": 4.9378769009427515e-05, | |
| "loss": 1.2088, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.0206, | |
| "grad_norm": 1.139224886894226, | |
| "learning_rate": 4.937524897003637e-05, | |
| "loss": 1.2158, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.0208, | |
| "grad_norm": 1.2190488576889038, | |
| "learning_rate": 4.9371719112297845e-05, | |
| "loss": 1.19, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 1.2439500093460083, | |
| "learning_rate": 4.936817943763378e-05, | |
| "loss": 1.173, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.0212, | |
| "grad_norm": 1.030110478401184, | |
| "learning_rate": 4.936462994746995e-05, | |
| "loss": 1.1995, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.0214, | |
| "grad_norm": 0.666333794593811, | |
| "learning_rate": 4.93610706432361e-05, | |
| "loss": 1.2476, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.0216, | |
| "grad_norm": 0.8477672934532166, | |
| "learning_rate": 4.93575015263659e-05, | |
| "loss": 1.2225, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.0218, | |
| "grad_norm": 1.087173342704773, | |
| "learning_rate": 4.9353922598296995e-05, | |
| "loss": 1.1758, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 1.2760623693466187, | |
| "learning_rate": 4.935033386047099e-05, | |
| "loss": 1.2811, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 1.082631230354309, | |
| "eval_runtime": 75.9811, | |
| "eval_samples_per_second": 203.248, | |
| "eval_steps_per_second": 3.185, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.0222, | |
| "grad_norm": 1.0236754417419434, | |
| "learning_rate": 4.934673531433341e-05, | |
| "loss": 1.2283, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 1.509448766708374, | |
| "learning_rate": 4.934312696133376e-05, | |
| "loss": 1.1989, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.0226, | |
| "grad_norm": 1.2022035121917725, | |
| "learning_rate": 4.9339508802925475e-05, | |
| "loss": 1.2247, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.0228, | |
| "grad_norm": 1.4019054174423218, | |
| "learning_rate": 4.933588084056596e-05, | |
| "loss": 1.2201, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 1.06856107711792, | |
| "learning_rate": 4.933224307571655e-05, | |
| "loss": 1.1789, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.0232, | |
| "grad_norm": 1.0807596445083618, | |
| "learning_rate": 4.932859550984255e-05, | |
| "loss": 1.2361, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.0234, | |
| "grad_norm": 1.20824134349823, | |
| "learning_rate": 4.932493814441318e-05, | |
| "loss": 1.2167, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.0236, | |
| "grad_norm": 0.7066964507102966, | |
| "learning_rate": 4.9321270980901635e-05, | |
| "loss": 1.1941, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.0238, | |
| "grad_norm": 0.7342857122421265, | |
| "learning_rate": 4.9317594020785044e-05, | |
| "loss": 1.1709, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.239176630973816, | |
| "learning_rate": 4.931390726554449e-05, | |
| "loss": 1.2238, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 1.0859261751174927, | |
| "eval_runtime": 76.6051, | |
| "eval_samples_per_second": 201.592, | |
| "eval_steps_per_second": 3.159, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.0242, | |
| "grad_norm": 0.9031541347503662, | |
| "learning_rate": 4.9310210716665003e-05, | |
| "loss": 1.1621, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.0244, | |
| "grad_norm": 0.744767963886261, | |
| "learning_rate": 4.930650437563554e-05, | |
| "loss": 1.21, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.0246, | |
| "grad_norm": 1.2594637870788574, | |
| "learning_rate": 4.9302788243949025e-05, | |
| "loss": 1.21, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.0248, | |
| "grad_norm": 0.67472243309021, | |
| "learning_rate": 4.929906232310231e-05, | |
| "loss": 1.1785, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 1.3947267532348633, | |
| "learning_rate": 4.92953266145962e-05, | |
| "loss": 1.1598, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.0252, | |
| "grad_norm": 0.7739892601966858, | |
| "learning_rate": 4.929158111993543e-05, | |
| "loss": 1.1492, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.0254, | |
| "grad_norm": 0.8620167970657349, | |
| "learning_rate": 4.9287825840628695e-05, | |
| "loss": 1.1863, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 0.7649038434028625, | |
| "learning_rate": 4.928406077818861e-05, | |
| "loss": 1.1782, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.0258, | |
| "grad_norm": 1.2743923664093018, | |
| "learning_rate": 4.9280285934131755e-05, | |
| "loss": 1.2254, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.6955134272575378, | |
| "learning_rate": 4.927650130997862e-05, | |
| "loss": 1.2254, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 1.0833112001419067, | |
| "eval_runtime": 77.475, | |
| "eval_samples_per_second": 199.329, | |
| "eval_steps_per_second": 3.124, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.0262, | |
| "grad_norm": 0.8997926115989685, | |
| "learning_rate": 4.927270690725367e-05, | |
| "loss": 1.1989, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.0264, | |
| "grad_norm": 1.3762701749801636, | |
| "learning_rate": 4.9268902727485276e-05, | |
| "loss": 1.1928, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.0266, | |
| "grad_norm": 0.7553657293319702, | |
| "learning_rate": 4.926508877220577e-05, | |
| "loss": 1.2266, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.0268, | |
| "grad_norm": 0.6331331133842468, | |
| "learning_rate": 4.92612650429514e-05, | |
| "loss": 1.2034, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.6229783892631531, | |
| "learning_rate": 4.925743154126238e-05, | |
| "loss": 1.2123, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.0272, | |
| "grad_norm": 1.2101593017578125, | |
| "learning_rate": 4.9253588268682835e-05, | |
| "loss": 1.2473, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 0.0274, | |
| "grad_norm": 1.2178127765655518, | |
| "learning_rate": 4.924973522676083e-05, | |
| "loss": 1.2391, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 0.0276, | |
| "grad_norm": 1.4870595932006836, | |
| "learning_rate": 4.924587241704838e-05, | |
| "loss": 1.2358, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.0278, | |
| "grad_norm": 1.2042150497436523, | |
| "learning_rate": 4.924199984110142e-05, | |
| "loss": 1.1996, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 1.3220444917678833, | |
| "learning_rate": 4.923811750047982e-05, | |
| "loss": 1.2052, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 1.0859400033950806, | |
| "eval_runtime": 76.6882, | |
| "eval_samples_per_second": 201.374, | |
| "eval_steps_per_second": 3.156, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.0282, | |
| "grad_norm": 1.464141607284546, | |
| "learning_rate": 4.923422539674739e-05, | |
| "loss": 1.2326, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 0.0284, | |
| "grad_norm": 1.2406100034713745, | |
| "learning_rate": 4.923032353147187e-05, | |
| "loss": 1.2092, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 0.0286, | |
| "grad_norm": 0.9459540247917175, | |
| "learning_rate": 4.9226411906224935e-05, | |
| "loss": 1.2023, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 1.2143398523330688, | |
| "learning_rate": 4.922249052258217e-05, | |
| "loss": 1.2348, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 1.1002607345581055, | |
| "learning_rate": 4.921855938212312e-05, | |
| "loss": 1.1912, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.0292, | |
| "grad_norm": 1.169640302658081, | |
| "learning_rate": 4.921461848643126e-05, | |
| "loss": 1.1797, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 0.0294, | |
| "grad_norm": 1.2756543159484863, | |
| "learning_rate": 4.921066783709396e-05, | |
| "loss": 1.1691, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 0.0296, | |
| "grad_norm": 0.5525041222572327, | |
| "learning_rate": 4.920670743570255e-05, | |
| "loss": 1.2011, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 0.0298, | |
| "grad_norm": 0.7082927823066711, | |
| "learning_rate": 4.9202737283852284e-05, | |
| "loss": 1.1831, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.7773894667625427, | |
| "learning_rate": 4.919875738314233e-05, | |
| "loss": 1.1947, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 1.0890144109725952, | |
| "eval_runtime": 76.6594, | |
| "eval_samples_per_second": 201.45, | |
| "eval_steps_per_second": 3.157, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.0302, | |
| "grad_norm": 0.7057791352272034, | |
| "learning_rate": 4.91947677351758e-05, | |
| "loss": 1.2717, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 0.0304, | |
| "grad_norm": 0.9837706685066223, | |
| "learning_rate": 4.919076834155971e-05, | |
| "loss": 1.206, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 0.0306, | |
| "grad_norm": 0.5716899633407593, | |
| "learning_rate": 4.918675920390504e-05, | |
| "loss": 1.2071, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 0.0308, | |
| "grad_norm": 0.6972540020942688, | |
| "learning_rate": 4.918274032382665e-05, | |
| "loss": 1.1761, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 1.4802424907684326, | |
| "learning_rate": 4.917871170294334e-05, | |
| "loss": 1.2109, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.0312, | |
| "grad_norm": 0.7575565576553345, | |
| "learning_rate": 4.9174673342877854e-05, | |
| "loss": 1.2169, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 0.0314, | |
| "grad_norm": 2.227360963821411, | |
| "learning_rate": 4.917062524525684e-05, | |
| "loss": 1.1657, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 0.0316, | |
| "grad_norm": 0.8020743727684021, | |
| "learning_rate": 4.916656741171086e-05, | |
| "loss": 1.2073, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 0.0318, | |
| "grad_norm": 1.1863917112350464, | |
| "learning_rate": 4.916249984387443e-05, | |
| "loss": 1.211, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.5976528525352478, | |
| "learning_rate": 4.915842254338594e-05, | |
| "loss": 1.2468, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 1.0842978954315186, | |
| "eval_runtime": 76.5369, | |
| "eval_samples_per_second": 201.772, | |
| "eval_steps_per_second": 3.162, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.0322, | |
| "grad_norm": 1.4908519983291626, | |
| "learning_rate": 4.915433551188774e-05, | |
| "loss": 1.1695, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 0.0324, | |
| "grad_norm": 1.1190279722213745, | |
| "learning_rate": 4.915023875102609e-05, | |
| "loss": 1.2017, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 0.0326, | |
| "grad_norm": 1.1334049701690674, | |
| "learning_rate": 4.914613226245115e-05, | |
| "loss": 1.2083, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 0.0328, | |
| "grad_norm": 0.6902172565460205, | |
| "learning_rate": 4.914201604781703e-05, | |
| "loss": 1.233, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.7509928941726685, | |
| "learning_rate": 4.913789010878174e-05, | |
| "loss": 1.2437, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.0332, | |
| "grad_norm": 1.4217336177825928, | |
| "learning_rate": 4.9133754447007185e-05, | |
| "loss": 1.1909, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 0.0334, | |
| "grad_norm": 1.212930679321289, | |
| "learning_rate": 4.912960906415923e-05, | |
| "loss": 1.1828, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 0.0336, | |
| "grad_norm": 1.1408753395080566, | |
| "learning_rate": 4.912545396190763e-05, | |
| "loss": 1.2118, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 0.0338, | |
| "grad_norm": 0.649695634841919, | |
| "learning_rate": 4.9121289141926066e-05, | |
| "loss": 1.1877, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 1.4613287448883057, | |
| "learning_rate": 4.911711460589211e-05, | |
| "loss": 1.1977, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 1.0870256423950195, | |
| "eval_runtime": 76.7051, | |
| "eval_samples_per_second": 201.33, | |
| "eval_steps_per_second": 3.155, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.0342, | |
| "grad_norm": 1.1586204767227173, | |
| "learning_rate": 4.9112930355487284e-05, | |
| "loss": 1.2222, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 0.0344, | |
| "grad_norm": 1.220306158065796, | |
| "learning_rate": 4.910873639239699e-05, | |
| "loss": 1.1909, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 0.0346, | |
| "grad_norm": 0.589338481426239, | |
| "learning_rate": 4.910453271831056e-05, | |
| "loss": 1.2034, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 0.0348, | |
| "grad_norm": 1.4743396043777466, | |
| "learning_rate": 4.910031933492123e-05, | |
| "loss": 1.2019, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.6481319069862366, | |
| "learning_rate": 4.909609624392616e-05, | |
| "loss": 1.2107, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 1.1668992042541504, | |
| "learning_rate": 4.9091863447026404e-05, | |
| "loss": 1.2498, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 0.0354, | |
| "grad_norm": 1.115519404411316, | |
| "learning_rate": 4.908762094592693e-05, | |
| "loss": 1.206, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 0.0356, | |
| "grad_norm": 1.3867928981781006, | |
| "learning_rate": 4.908336874233662e-05, | |
| "loss": 1.2082, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 0.0358, | |
| "grad_norm": 0.6380243301391602, | |
| "learning_rate": 4.9079106837968264e-05, | |
| "loss": 1.1693, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 1.8375539779663086, | |
| "learning_rate": 4.907483523453855e-05, | |
| "loss": 1.1531, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 1.0780328512191772, | |
| "eval_runtime": 76.5805, | |
| "eval_samples_per_second": 201.657, | |
| "eval_steps_per_second": 3.16, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.0362, | |
| "grad_norm": 1.231332778930664, | |
| "learning_rate": 4.907055393376808e-05, | |
| "loss": 1.1618, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 0.0364, | |
| "grad_norm": 1.2306678295135498, | |
| "learning_rate": 4.906626293738137e-05, | |
| "loss": 1.2365, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 0.0366, | |
| "grad_norm": 1.057521104812622, | |
| "learning_rate": 4.906196224710683e-05, | |
| "loss": 1.1775, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 0.0368, | |
| "grad_norm": 0.9679245352745056, | |
| "learning_rate": 4.905765186467677e-05, | |
| "loss": 1.2175, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 1.325900912284851, | |
| "learning_rate": 4.9053331791827404e-05, | |
| "loss": 1.1848, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.0372, | |
| "grad_norm": 1.3124104738235474, | |
| "learning_rate": 4.9049002030298887e-05, | |
| "loss": 1.1779, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 0.0374, | |
| "grad_norm": 1.7284040451049805, | |
| "learning_rate": 4.904466258183522e-05, | |
| "loss": 1.2144, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 0.0376, | |
| "grad_norm": 0.9314505457878113, | |
| "learning_rate": 4.904031344818434e-05, | |
| "loss": 1.219, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 0.0378, | |
| "grad_norm": 1.1688934564590454, | |
| "learning_rate": 4.903595463109808e-05, | |
| "loss": 1.2268, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 1.0910236835479736, | |
| "learning_rate": 4.903158613233216e-05, | |
| "loss": 1.2213, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 1.0866200923919678, | |
| "eval_runtime": 76.736, | |
| "eval_samples_per_second": 201.248, | |
| "eval_steps_per_second": 3.154, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.0382, | |
| "grad_norm": 1.0715341567993164, | |
| "learning_rate": 4.902720795364623e-05, | |
| "loss": 1.2007, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 0.6578232645988464, | |
| "learning_rate": 4.902282009680381e-05, | |
| "loss": 1.2078, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 0.0386, | |
| "grad_norm": 1.34630286693573, | |
| "learning_rate": 4.9018422563572326e-05, | |
| "loss": 1.1894, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 0.0388, | |
| "grad_norm": 1.1832722425460815, | |
| "learning_rate": 4.9014015355723104e-05, | |
| "loss": 1.1846, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.9175591468811035, | |
| "learning_rate": 4.900959847503137e-05, | |
| "loss": 1.1984, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.0392, | |
| "grad_norm": 1.077879548072815, | |
| "learning_rate": 4.9005171923276236e-05, | |
| "loss": 1.1868, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 0.0394, | |
| "grad_norm": 0.5999984741210938, | |
| "learning_rate": 4.900073570224073e-05, | |
| "loss": 1.1816, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 0.0396, | |
| "grad_norm": 1.24228835105896, | |
| "learning_rate": 4.899628981371175e-05, | |
| "loss": 1.191, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 0.0398, | |
| "grad_norm": 0.7666544318199158, | |
| "learning_rate": 4.899183425948011e-05, | |
| "loss": 1.1813, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.2996748685836792, | |
| "learning_rate": 4.8987369041340486e-05, | |
| "loss": 1.184, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 1.0817583799362183, | |
| "eval_runtime": 77.0972, | |
| "eval_samples_per_second": 200.306, | |
| "eval_steps_per_second": 3.139, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.0402, | |
| "grad_norm": 1.1717365980148315, | |
| "learning_rate": 4.898289416109149e-05, | |
| "loss": 1.1936, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 0.0404, | |
| "grad_norm": 1.3680170774459839, | |
| "learning_rate": 4.8978409620535595e-05, | |
| "loss": 1.2138, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 0.0406, | |
| "grad_norm": 1.6390254497528076, | |
| "learning_rate": 4.897391542147916e-05, | |
| "loss": 1.1883, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 0.0408, | |
| "grad_norm": 1.2523001432418823, | |
| "learning_rate": 4.896941156573247e-05, | |
| "loss": 1.2157, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 1.4317930936813354, | |
| "learning_rate": 4.896489805510966e-05, | |
| "loss": 1.1721, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.0412, | |
| "grad_norm": 0.9794881939888, | |
| "learning_rate": 4.896037489142879e-05, | |
| "loss": 1.2073, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 0.0414, | |
| "grad_norm": 0.8774665594100952, | |
| "learning_rate": 4.895584207651178e-05, | |
| "loss": 1.1934, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 1.421742558479309, | |
| "learning_rate": 4.895129961218444e-05, | |
| "loss": 1.2078, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 0.0418, | |
| "grad_norm": 1.0715827941894531, | |
| "learning_rate": 4.894674750027648e-05, | |
| "loss": 1.1713, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.7623746991157532, | |
| "learning_rate": 4.894218574262149e-05, | |
| "loss": 1.1779, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 1.0817545652389526, | |
| "eval_runtime": 76.5318, | |
| "eval_samples_per_second": 201.785, | |
| "eval_steps_per_second": 3.162, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.0422, | |
| "grad_norm": 0.710477888584137, | |
| "learning_rate": 4.893761434105695e-05, | |
| "loss": 1.1876, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 0.0424, | |
| "grad_norm": 1.244310736656189, | |
| "learning_rate": 4.893303329742421e-05, | |
| "loss": 1.2077, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 0.0426, | |
| "grad_norm": 1.6161651611328125, | |
| "learning_rate": 4.8928442613568535e-05, | |
| "loss": 1.1896, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 0.0428, | |
| "grad_norm": 1.0831233263015747, | |
| "learning_rate": 4.892384229133902e-05, | |
| "loss": 1.1904, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.8258353471755981, | |
| "learning_rate": 4.89192323325887e-05, | |
| "loss": 1.1906, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.0432, | |
| "grad_norm": 0.7877621054649353, | |
| "learning_rate": 4.8914612739174456e-05, | |
| "loss": 1.1416, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 0.0434, | |
| "grad_norm": 1.2102254629135132, | |
| "learning_rate": 4.890998351295706e-05, | |
| "loss": 1.1782, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 0.0436, | |
| "grad_norm": 1.139289140701294, | |
| "learning_rate": 4.890534465580115e-05, | |
| "loss": 1.1471, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 0.0438, | |
| "grad_norm": 1.2521135807037354, | |
| "learning_rate": 4.890069616957529e-05, | |
| "loss": 1.206, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 1.3690674304962158, | |
| "learning_rate": 4.889603805615187e-05, | |
| "loss": 1.2328, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 1.0797057151794434, | |
| "eval_runtime": 76.4385, | |
| "eval_samples_per_second": 202.032, | |
| "eval_steps_per_second": 3.166, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.0442, | |
| "grad_norm": 1.2689367532730103, | |
| "learning_rate": 4.889137031740717e-05, | |
| "loss": 1.2189, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 0.0444, | |
| "grad_norm": 1.0029367208480835, | |
| "learning_rate": 4.888669295522137e-05, | |
| "loss": 1.1754, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 0.0446, | |
| "grad_norm": 0.6958720684051514, | |
| "learning_rate": 4.8882005971478504e-05, | |
| "loss": 1.1601, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 1.2337570190429688, | |
| "learning_rate": 4.887730936806648e-05, | |
| "loss": 1.2244, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 1.2311972379684448, | |
| "learning_rate": 4.8872603146877104e-05, | |
| "loss": 1.2031, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.0452, | |
| "grad_norm": 1.145331859588623, | |
| "learning_rate": 4.886788730980604e-05, | |
| "loss": 1.1947, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 0.0454, | |
| "grad_norm": 1.1688799858093262, | |
| "learning_rate": 4.886316185875282e-05, | |
| "loss": 1.1655, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 0.0456, | |
| "grad_norm": 1.2751972675323486, | |
| "learning_rate": 4.885842679562085e-05, | |
| "loss": 1.2038, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 0.0458, | |
| "grad_norm": 0.6860191822052002, | |
| "learning_rate": 4.8853682122317426e-05, | |
| "loss": 1.1922, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 1.4772953987121582, | |
| "learning_rate": 4.8848927840753695e-05, | |
| "loss": 1.1856, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 1.0836056470870972, | |
| "eval_runtime": 76.3679, | |
| "eval_samples_per_second": 202.218, | |
| "eval_steps_per_second": 3.169, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.0462, | |
| "grad_norm": 1.2491508722305298, | |
| "learning_rate": 4.884416395284468e-05, | |
| "loss": 1.1924, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 0.0464, | |
| "grad_norm": 1.1689327955245972, | |
| "learning_rate": 4.883939046050928e-05, | |
| "loss": 1.1675, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 0.0466, | |
| "grad_norm": 1.0528528690338135, | |
| "learning_rate": 4.883460736567025e-05, | |
| "loss": 1.1879, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 0.0468, | |
| "grad_norm": 1.141653060913086, | |
| "learning_rate": 4.8829814670254226e-05, | |
| "loss": 1.1637, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.8094840049743652, | |
| "learning_rate": 4.88250123761917e-05, | |
| "loss": 1.1924, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.0472, | |
| "grad_norm": 1.4988161325454712, | |
| "learning_rate": 4.8820200485417036e-05, | |
| "loss": 1.1962, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 0.0474, | |
| "grad_norm": 0.8497682809829712, | |
| "learning_rate": 4.881537899986847e-05, | |
| "loss": 1.1987, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 0.0476, | |
| "grad_norm": 1.0132189989089966, | |
| "learning_rate": 4.8810547921488083e-05, | |
| "loss": 1.1666, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 0.0478, | |
| "grad_norm": 1.275478720664978, | |
| "learning_rate": 4.8805707252221846e-05, | |
| "loss": 1.2072, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 1.1257511377334595, | |
| "learning_rate": 4.880085699401958e-05, | |
| "loss": 1.2128, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 1.081576943397522, | |
| "eval_runtime": 76.6431, | |
| "eval_samples_per_second": 201.492, | |
| "eval_steps_per_second": 3.157, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.0482, | |
| "grad_norm": 1.132750153541565, | |
| "learning_rate": 4.879599714883496e-05, | |
| "loss": 1.2239, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 0.0484, | |
| "grad_norm": 1.3854628801345825, | |
| "learning_rate": 4.8791127718625526e-05, | |
| "loss": 1.1447, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 0.0486, | |
| "grad_norm": 1.32233464717865, | |
| "learning_rate": 4.87862487053527e-05, | |
| "loss": 1.1765, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 0.0488, | |
| "grad_norm": 1.1571578979492188, | |
| "learning_rate": 4.8781360110981744e-05, | |
| "loss": 1.1844, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 1.552740216255188, | |
| "learning_rate": 4.877646193748177e-05, | |
| "loss": 1.1336, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.0492, | |
| "grad_norm": 1.3447420597076416, | |
| "learning_rate": 4.8771554186825774e-05, | |
| "loss": 1.2401, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 0.0494, | |
| "grad_norm": 1.0012767314910889, | |
| "learning_rate": 4.87666368609906e-05, | |
| "loss": 1.2236, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 0.0496, | |
| "grad_norm": 1.1246662139892578, | |
| "learning_rate": 4.876170996195693e-05, | |
| "loss": 1.2452, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 0.0498, | |
| "grad_norm": 0.7534450888633728, | |
| "learning_rate": 4.875677349170934e-05, | |
| "loss": 1.2333, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.2943884134292603, | |
| "learning_rate": 4.875182745223622e-05, | |
| "loss": 1.1986, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 1.0774849653244019, | |
| "eval_runtime": 76.6003, | |
| "eval_samples_per_second": 201.605, | |
| "eval_steps_per_second": 3.159, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.0502, | |
| "grad_norm": 1.0771546363830566, | |
| "learning_rate": 4.874687184552984e-05, | |
| "loss": 1.2022, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 0.0504, | |
| "grad_norm": 1.1722393035888672, | |
| "learning_rate": 4.8741906673586334e-05, | |
| "loss": 1.1856, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 0.0506, | |
| "grad_norm": 0.7547242045402527, | |
| "learning_rate": 4.873693193840565e-05, | |
| "loss": 1.153, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 0.0508, | |
| "grad_norm": 0.9694270491600037, | |
| "learning_rate": 4.873194764199162e-05, | |
| "loss": 1.2389, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.6288232803344727, | |
| "learning_rate": 4.872695378635192e-05, | |
| "loss": 1.195, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 1.1400961875915527, | |
| "learning_rate": 4.872195037349807e-05, | |
| "loss": 1.1903, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 0.0514, | |
| "grad_norm": 1.0738123655319214, | |
| "learning_rate": 4.871693740544545e-05, | |
| "loss": 1.1764, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 0.0516, | |
| "grad_norm": 1.2298240661621094, | |
| "learning_rate": 4.871191488421327e-05, | |
| "loss": 1.1701, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 0.0518, | |
| "grad_norm": 1.3240865468978882, | |
| "learning_rate": 4.8706882811824624e-05, | |
| "loss": 1.1828, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 1.4167003631591797, | |
| "learning_rate": 4.870184119030641e-05, | |
| "loss": 1.204, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 1.0775164365768433, | |
| "eval_runtime": 76.8889, | |
| "eval_samples_per_second": 200.848, | |
| "eval_steps_per_second": 3.147, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.0522, | |
| "grad_norm": 0.6648851037025452, | |
| "learning_rate": 4.86967900216894e-05, | |
| "loss": 1.174, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 0.0524, | |
| "grad_norm": 1.29317307472229, | |
| "learning_rate": 4.8691729308008196e-05, | |
| "loss": 1.1695, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 0.0526, | |
| "grad_norm": 1.3121986389160156, | |
| "learning_rate": 4.868665905130127e-05, | |
| "loss": 1.1941, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 0.0528, | |
| "grad_norm": 0.6604340672492981, | |
| "learning_rate": 4.868157925361091e-05, | |
| "loss": 1.1875, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 1.0366885662078857, | |
| "learning_rate": 4.867648991698325e-05, | |
| "loss": 1.2265, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.0532, | |
| "grad_norm": 1.382543683052063, | |
| "learning_rate": 4.867139104346829e-05, | |
| "loss": 1.2122, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 0.0534, | |
| "grad_norm": 1.0773979425430298, | |
| "learning_rate": 4.866628263511985e-05, | |
| "loss": 1.2375, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 0.0536, | |
| "grad_norm": 1.178758978843689, | |
| "learning_rate": 4.8661164693995584e-05, | |
| "loss": 1.1959, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 0.0538, | |
| "grad_norm": 0.7048764228820801, | |
| "learning_rate": 4.865603722215702e-05, | |
| "loss": 1.1841, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 1.3390711545944214, | |
| "learning_rate": 4.865090022166949e-05, | |
| "loss": 1.2033, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 1.0746017694473267, | |
| "eval_runtime": 77.1768, | |
| "eval_samples_per_second": 200.099, | |
| "eval_steps_per_second": 3.136, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.0542, | |
| "grad_norm": 1.0639598369598389, | |
| "learning_rate": 4.864575369460218e-05, | |
| "loss": 1.1948, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 1.1349152326583862, | |
| "learning_rate": 4.86405976430281e-05, | |
| "loss": 1.1666, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 0.0546, | |
| "grad_norm": 1.0187245607376099, | |
| "learning_rate": 4.8635432069024125e-05, | |
| "loss": 1.1614, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 0.0548, | |
| "grad_norm": 0.6468742489814758, | |
| "learning_rate": 4.863025697467094e-05, | |
| "loss": 1.2043, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 1.1298869848251343, | |
| "learning_rate": 4.862507236205307e-05, | |
| "loss": 1.1884, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.0552, | |
| "grad_norm": 0.7240111827850342, | |
| "learning_rate": 4.861987823325887e-05, | |
| "loss": 1.186, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 0.0554, | |
| "grad_norm": 0.8047366142272949, | |
| "learning_rate": 4.861467459038056e-05, | |
| "loss": 1.2029, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 0.0556, | |
| "grad_norm": 0.8840340375900269, | |
| "learning_rate": 4.860946143551413e-05, | |
| "loss": 1.19, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 0.0558, | |
| "grad_norm": 1.1575409173965454, | |
| "learning_rate": 4.860423877075947e-05, | |
| "loss": 1.1637, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.6591224074363708, | |
| "learning_rate": 4.859900659822025e-05, | |
| "loss": 1.2203, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 1.0788133144378662, | |
| "eval_runtime": 76.7654, | |
| "eval_samples_per_second": 201.171, | |
| "eval_steps_per_second": 3.152, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.0562, | |
| "grad_norm": 1.3405015468597412, | |
| "learning_rate": 4.859376492000399e-05, | |
| "loss": 1.19, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 0.0564, | |
| "grad_norm": 1.1912270784378052, | |
| "learning_rate": 4.858851373822205e-05, | |
| "loss": 1.1521, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 0.0566, | |
| "grad_norm": 1.0169751644134521, | |
| "learning_rate": 4.85832530549896e-05, | |
| "loss": 1.2054, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 0.0568, | |
| "grad_norm": 0.6713104248046875, | |
| "learning_rate": 4.857798287242563e-05, | |
| "loss": 1.2033, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 1.2116252183914185, | |
| "learning_rate": 4.857270319265298e-05, | |
| "loss": 1.1919, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.0572, | |
| "grad_norm": 0.9526674151420593, | |
| "learning_rate": 4.856741401779831e-05, | |
| "loss": 1.1724, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 0.0574, | |
| "grad_norm": 1.458253264427185, | |
| "learning_rate": 4.856211534999209e-05, | |
| "loss": 1.1479, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 1.173437237739563, | |
| "learning_rate": 4.855680719136862e-05, | |
| "loss": 1.2005, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 0.0578, | |
| "grad_norm": 0.7292013168334961, | |
| "learning_rate": 4.8551489544066034e-05, | |
| "loss": 1.1292, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.6017533540725708, | |
| "learning_rate": 4.854616241022627e-05, | |
| "loss": 1.1527, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 1.0688042640686035, | |
| "eval_runtime": 76.596, | |
| "eval_samples_per_second": 201.616, | |
| "eval_steps_per_second": 3.159, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.0582, | |
| "grad_norm": 0.8270254731178284, | |
| "learning_rate": 4.8540825791995114e-05, | |
| "loss": 1.1517, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 0.0584, | |
| "grad_norm": 1.1182663440704346, | |
| "learning_rate": 4.8535479691522136e-05, | |
| "loss": 1.2282, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 0.0586, | |
| "grad_norm": 1.1249291896820068, | |
| "learning_rate": 4.853012411096075e-05, | |
| "loss": 1.1314, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 0.0588, | |
| "grad_norm": 0.6025962233543396, | |
| "learning_rate": 4.85247590524682e-05, | |
| "loss": 1.1879, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 1.2914466857910156, | |
| "learning_rate": 4.85193845182055e-05, | |
| "loss": 1.1926, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.0592, | |
| "grad_norm": 0.7965525388717651, | |
| "learning_rate": 4.8514000510337544e-05, | |
| "loss": 1.2344, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 0.0594, | |
| "grad_norm": 0.6595709323883057, | |
| "learning_rate": 4.850860703103298e-05, | |
| "loss": 1.2056, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 0.0596, | |
| "grad_norm": 0.783892035484314, | |
| "learning_rate": 4.850320408246433e-05, | |
| "loss": 1.1343, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 0.0598, | |
| "grad_norm": 0.948952853679657, | |
| "learning_rate": 4.849779166680788e-05, | |
| "loss": 1.1607, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.725027322769165, | |
| "learning_rate": 4.849236978624375e-05, | |
| "loss": 1.2125, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 1.0838971138000488, | |
| "eval_runtime": 76.8451, | |
| "eval_samples_per_second": 200.963, | |
| "eval_steps_per_second": 3.149, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.0002, | |
| "grad_norm": 1.1788556575775146, | |
| "learning_rate": 4.848693844295589e-05, | |
| "loss": 1.1917, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 0.0004, | |
| "grad_norm": 1.3381775617599487, | |
| "learning_rate": 4.848149763913202e-05, | |
| "loss": 1.2108, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 0.0006, | |
| "grad_norm": 0.9748820066452026, | |
| "learning_rate": 4.847604737696372e-05, | |
| "loss": 1.2054, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 0.0008, | |
| "grad_norm": 1.3528317213058472, | |
| "learning_rate": 4.847058765864634e-05, | |
| "loss": 1.1582, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 1.0475611686706543, | |
| "learning_rate": 4.8465118486379065e-05, | |
| "loss": 1.1409, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.0012, | |
| "grad_norm": 0.667515754699707, | |
| "learning_rate": 4.8459639862364866e-05, | |
| "loss": 1.1548, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 0.0014, | |
| "grad_norm": 1.3529212474822998, | |
| "learning_rate": 4.845415178881053e-05, | |
| "loss": 1.1741, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 0.0016, | |
| "grad_norm": 1.2415704727172852, | |
| "learning_rate": 4.844865426792666e-05, | |
| "loss": 1.1689, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 0.0018, | |
| "grad_norm": 0.9598329663276672, | |
| "learning_rate": 4.844314730192766e-05, | |
| "loss": 1.2138, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 0.660463273525238, | |
| "learning_rate": 4.843763089303173e-05, | |
| "loss": 1.1897, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_loss": 1.0804229974746704, | |
| "eval_runtime": 77.9042, | |
| "eval_samples_per_second": 198.231, | |
| "eval_steps_per_second": 3.106, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.0022, | |
| "grad_norm": 1.3137476444244385, | |
| "learning_rate": 4.843210504346088e-05, | |
| "loss": 1.2149, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 0.0024, | |
| "grad_norm": 2.466374158859253, | |
| "learning_rate": 4.842656975544092e-05, | |
| "loss": 1.2294, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 0.0026, | |
| "grad_norm": 0.9236732721328735, | |
| "learning_rate": 4.842102503120146e-05, | |
| "loss": 1.2316, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 0.0028, | |
| "grad_norm": 0.9453101754188538, | |
| "learning_rate": 4.841547087297592e-05, | |
| "loss": 1.1903, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 1.0694693326950073, | |
| "learning_rate": 4.840990728300151e-05, | |
| "loss": 1.2027, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 1.0661156177520752, | |
| "learning_rate": 4.8404334263519254e-05, | |
| "loss": 1.2268, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 0.0034, | |
| "grad_norm": 1.3803828954696655, | |
| "learning_rate": 4.839875181677394e-05, | |
| "loss": 1.2084, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 0.0036, | |
| "grad_norm": 0.896979033946991, | |
| "learning_rate": 4.839315994501421e-05, | |
| "loss": 1.1818, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 0.0038, | |
| "grad_norm": 1.1509560346603394, | |
| "learning_rate": 4.8387558650492446e-05, | |
| "loss": 1.226, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 1.2490339279174805, | |
| "learning_rate": 4.8381947935464854e-05, | |
| "loss": 1.2283, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_loss": 1.086965560913086, | |
| "eval_runtime": 75.4991, | |
| "eval_samples_per_second": 204.545, | |
| "eval_steps_per_second": 3.205, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.0042, | |
| "grad_norm": 1.0047966241836548, | |
| "learning_rate": 4.837632780219142e-05, | |
| "loss": 1.2006, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 0.0044, | |
| "grad_norm": 1.3791793584823608, | |
| "learning_rate": 4.837069825293596e-05, | |
| "loss": 1.2191, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 0.0046, | |
| "grad_norm": 1.4083282947540283, | |
| "learning_rate": 4.836505928996603e-05, | |
| "loss": 1.2232, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 0.0048, | |
| "grad_norm": 1.5420063734054565, | |
| "learning_rate": 4.835941091555301e-05, | |
| "loss": 1.2281, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.7661809921264648, | |
| "learning_rate": 4.8353753131972066e-05, | |
| "loss": 1.2262, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.0052, | |
| "grad_norm": 0.5983784198760986, | |
| "learning_rate": 4.8348085941502164e-05, | |
| "loss": 1.2203, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 0.0054, | |
| "grad_norm": 0.8108716011047363, | |
| "learning_rate": 4.8342409346426024e-05, | |
| "loss": 1.1536, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 0.0056, | |
| "grad_norm": 0.9011421203613281, | |
| "learning_rate": 4.83367233490302e-05, | |
| "loss": 1.2214, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 0.0058, | |
| "grad_norm": 0.6667259335517883, | |
| "learning_rate": 4.8331027951604995e-05, | |
| "loss": 1.1932, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 1.2265853881835938, | |
| "learning_rate": 4.8325323156444525e-05, | |
| "loss": 1.235, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 1.0849037170410156, | |
| "eval_runtime": 76.5492, | |
| "eval_samples_per_second": 201.74, | |
| "eval_steps_per_second": 3.161, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.0062, | |
| "grad_norm": 1.468518614768982, | |
| "learning_rate": 4.831960896584667e-05, | |
| "loss": 1.1886, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 1.2378790378570557, | |
| "learning_rate": 4.831388538211312e-05, | |
| "loss": 1.1983, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 0.0066, | |
| "grad_norm": 1.2989089488983154, | |
| "learning_rate": 4.830815240754933e-05, | |
| "loss": 1.1894, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 0.0068, | |
| "grad_norm": 1.3696600198745728, | |
| "learning_rate": 4.830241004446453e-05, | |
| "loss": 1.1798, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 1.3715136051177979, | |
| "learning_rate": 4.829665829517175e-05, | |
| "loss": 1.2323, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.0072, | |
| "grad_norm": 0.7888614535331726, | |
| "learning_rate": 4.82908971619878e-05, | |
| "loss": 1.2098, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 0.0074, | |
| "grad_norm": 1.0456979274749756, | |
| "learning_rate": 4.828512664723326e-05, | |
| "loss": 1.21, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 0.0076, | |
| "grad_norm": 1.4525970220565796, | |
| "learning_rate": 4.827934675323248e-05, | |
| "loss": 1.191, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 0.0078, | |
| "grad_norm": 1.6751583814620972, | |
| "learning_rate": 4.8273557482313625e-05, | |
| "loss": 1.2084, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.7282904982566833, | |
| "learning_rate": 4.826775883680859e-05, | |
| "loss": 1.2376, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 1.0806148052215576, | |
| "eval_runtime": 75.7629, | |
| "eval_samples_per_second": 203.833, | |
| "eval_steps_per_second": 3.194, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.0082, | |
| "grad_norm": 1.0859407186508179, | |
| "learning_rate": 4.826195081905308e-05, | |
| "loss": 1.1807, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 0.0084, | |
| "grad_norm": 1.3917006254196167, | |
| "learning_rate": 4.8256133431386566e-05, | |
| "loss": 1.2012, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 0.0086, | |
| "grad_norm": 1.4448059797286987, | |
| "learning_rate": 4.825030667615228e-05, | |
| "loss": 1.2305, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 0.0088, | |
| "grad_norm": 1.0721293687820435, | |
| "learning_rate": 4.824447055569725e-05, | |
| "loss": 1.2332, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 0.9940403699874878, | |
| "learning_rate": 4.823862507237226e-05, | |
| "loss": 1.2096, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.0092, | |
| "grad_norm": 1.5473828315734863, | |
| "learning_rate": 4.823277022853187e-05, | |
| "loss": 1.1706, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 0.0094, | |
| "grad_norm": 1.3127409219741821, | |
| "learning_rate": 4.822690602653441e-05, | |
| "loss": 1.2051, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 1.7532451152801514, | |
| "learning_rate": 4.822103246874198e-05, | |
| "loss": 1.196, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 0.0098, | |
| "grad_norm": 0.8706884980201721, | |
| "learning_rate": 4.8215149557520446e-05, | |
| "loss": 1.1862, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.5764431953430176, | |
| "learning_rate": 4.8209257295239455e-05, | |
| "loss": 1.2257, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 1.0817573070526123, | |
| "eval_runtime": 75.771, | |
| "eval_samples_per_second": 203.811, | |
| "eval_steps_per_second": 3.194, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.0102, | |
| "grad_norm": 1.467939019203186, | |
| "learning_rate": 4.820335568427239e-05, | |
| "loss": 1.2317, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 0.0104, | |
| "grad_norm": 1.270477533340454, | |
| "learning_rate": 4.819744472699643e-05, | |
| "loss": 1.2308, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 0.0106, | |
| "grad_norm": 1.073867917060852, | |
| "learning_rate": 4.8191524425792526e-05, | |
| "loss": 1.1991, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 0.0108, | |
| "grad_norm": 1.0844908952713013, | |
| "learning_rate": 4.818559478304534e-05, | |
| "loss": 1.1914, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 1.282365322113037, | |
| "learning_rate": 4.817965580114335e-05, | |
| "loss": 1.2035, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.0112, | |
| "grad_norm": 1.3751475811004639, | |
| "learning_rate": 4.817370748247878e-05, | |
| "loss": 1.215, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 0.0114, | |
| "grad_norm": 1.484107255935669, | |
| "learning_rate": 4.81677498294476e-05, | |
| "loss": 1.2298, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 0.0116, | |
| "grad_norm": 1.326803207397461, | |
| "learning_rate": 4.8161782844449566e-05, | |
| "loss": 1.1794, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 0.0118, | |
| "grad_norm": 1.6823039054870605, | |
| "learning_rate": 4.815580652988817e-05, | |
| "loss": 1.1896, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 1.1735076904296875, | |
| "learning_rate": 4.8149820888170673e-05, | |
| "loss": 1.2089, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 1.081894874572754, | |
| "eval_runtime": 75.5115, | |
| "eval_samples_per_second": 204.512, | |
| "eval_steps_per_second": 3.205, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.0122, | |
| "grad_norm": 1.0032376050949097, | |
| "learning_rate": 4.814382592170808e-05, | |
| "loss": 1.2197, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 0.0124, | |
| "grad_norm": 1.2638306617736816, | |
| "learning_rate": 4.813782163291519e-05, | |
| "loss": 1.2009, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 0.0126, | |
| "grad_norm": 1.2233041524887085, | |
| "learning_rate": 4.813180802421051e-05, | |
| "loss": 1.2069, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 0.857825756072998, | |
| "learning_rate": 4.812578509801632e-05, | |
| "loss": 1.1942, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.8879494667053223, | |
| "learning_rate": 4.811975285675866e-05, | |
| "loss": 1.1689, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.0132, | |
| "grad_norm": 1.3842177391052246, | |
| "learning_rate": 4.811371130286731e-05, | |
| "loss": 1.1941, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 0.0134, | |
| "grad_norm": 1.303063988685608, | |
| "learning_rate": 4.810766043877582e-05, | |
| "loss": 1.194, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 0.0136, | |
| "grad_norm": 1.3135032653808594, | |
| "learning_rate": 4.810160026692147e-05, | |
| "loss": 1.1536, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 0.0138, | |
| "grad_norm": 0.8059789538383484, | |
| "learning_rate": 4.809553078974528e-05, | |
| "loss": 1.2083, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 1.493458867073059, | |
| "learning_rate": 4.808945200969206e-05, | |
| "loss": 1.2031, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 1.0807029008865356, | |
| "eval_runtime": 76.4746, | |
| "eval_samples_per_second": 201.936, | |
| "eval_steps_per_second": 3.164, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.0142, | |
| "grad_norm": 0.9932582378387451, | |
| "learning_rate": 4.808336392921033e-05, | |
| "loss": 1.1932, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 0.0144, | |
| "grad_norm": 1.1588648557662964, | |
| "learning_rate": 4.807726655075237e-05, | |
| "loss": 1.2004, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 0.0146, | |
| "grad_norm": 0.713295042514801, | |
| "learning_rate": 4.80711598767742e-05, | |
| "loss": 1.1336, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 0.0148, | |
| "grad_norm": 1.474853277206421, | |
| "learning_rate": 4.80650439097356e-05, | |
| "loss": 1.1909, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 1.0433249473571777, | |
| "learning_rate": 4.805891865210006e-05, | |
| "loss": 1.1868, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.0152, | |
| "grad_norm": 0.9942545294761658, | |
| "learning_rate": 4.8052784106334854e-05, | |
| "loss": 1.1896, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 0.0154, | |
| "grad_norm": 0.9021309018135071, | |
| "learning_rate": 4.804664027491096e-05, | |
| "loss": 1.2265, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 0.0156, | |
| "grad_norm": 1.4818402528762817, | |
| "learning_rate": 4.8040487160303126e-05, | |
| "loss": 1.2149, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 0.0158, | |
| "grad_norm": 0.74870365858078, | |
| "learning_rate": 4.803432476498981e-05, | |
| "loss": 1.1928, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.7827754020690918, | |
| "learning_rate": 4.8028153091453246e-05, | |
| "loss": 1.2062, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 1.0748348236083984, | |
| "eval_runtime": 75.9274, | |
| "eval_samples_per_second": 203.392, | |
| "eval_steps_per_second": 3.187, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.0162, | |
| "grad_norm": 1.2250913381576538, | |
| "learning_rate": 4.802197214217936e-05, | |
| "loss": 1.1412, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 0.0164, | |
| "grad_norm": 1.4763202667236328, | |
| "learning_rate": 4.801578191965785e-05, | |
| "loss": 1.173, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 0.0166, | |
| "grad_norm": 0.8980317115783691, | |
| "learning_rate": 4.800958242638214e-05, | |
| "loss": 1.1801, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 0.0168, | |
| "grad_norm": 1.2781926393508911, | |
| "learning_rate": 4.800337366484937e-05, | |
| "loss": 1.2012, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.8269230723381042, | |
| "learning_rate": 4.799715563756045e-05, | |
| "loss": 1.2319, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.0172, | |
| "grad_norm": 0.633537232875824, | |
| "learning_rate": 4.7990928347019984e-05, | |
| "loss": 1.2058, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 0.0174, | |
| "grad_norm": 1.39164400100708, | |
| "learning_rate": 4.7984691795736324e-05, | |
| "loss": 1.2066, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 0.0176, | |
| "grad_norm": 1.5555399656295776, | |
| "learning_rate": 4.7978445986221566e-05, | |
| "loss": 1.2088, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 0.0178, | |
| "grad_norm": 1.2505526542663574, | |
| "learning_rate": 4.7972190920991514e-05, | |
| "loss": 1.203, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 1.5910965204238892, | |
| "learning_rate": 4.7965926602565705e-05, | |
| "loss": 1.1877, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 1.0717748403549194, | |
| "eval_runtime": 75.7519, | |
| "eval_samples_per_second": 203.863, | |
| "eval_steps_per_second": 3.195, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.0182, | |
| "grad_norm": 0.7755507826805115, | |
| "learning_rate": 4.79596530334674e-05, | |
| "loss": 1.1864, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 0.0184, | |
| "grad_norm": 1.2141857147216797, | |
| "learning_rate": 4.79533702162236e-05, | |
| "loss": 1.1849, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 0.0186, | |
| "grad_norm": 1.399149775505066, | |
| "learning_rate": 4.794707815336503e-05, | |
| "loss": 1.1702, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 0.0188, | |
| "grad_norm": 1.3381379842758179, | |
| "learning_rate": 4.7940776847426114e-05, | |
| "loss": 1.2052, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 1.347264051437378, | |
| "learning_rate": 4.793446630094503e-05, | |
| "loss": 1.1998, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 1.2072675228118896, | |
| "learning_rate": 4.792814651646367e-05, | |
| "loss": 1.2127, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 0.0194, | |
| "grad_norm": 0.7959086894989014, | |
| "learning_rate": 4.792181749652763e-05, | |
| "loss": 1.1474, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 0.0196, | |
| "grad_norm": 1.0272786617279053, | |
| "learning_rate": 4.7915479243686244e-05, | |
| "loss": 1.2033, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 0.0198, | |
| "grad_norm": 0.8985835909843445, | |
| "learning_rate": 4.790913176049256e-05, | |
| "loss": 1.1942, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.676177442073822, | |
| "learning_rate": 4.7902775049503346e-05, | |
| "loss": 1.1883, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 1.0733578205108643, | |
| "eval_runtime": 75.8186, | |
| "eval_samples_per_second": 203.684, | |
| "eval_steps_per_second": 3.192, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.0202, | |
| "grad_norm": 0.7747570872306824, | |
| "learning_rate": 4.789640911327907e-05, | |
| "loss": 1.1883, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 0.0204, | |
| "grad_norm": 1.1808815002441406, | |
| "learning_rate": 4.789003395438395e-05, | |
| "loss": 1.1932, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 0.0206, | |
| "grad_norm": 1.29102623462677, | |
| "learning_rate": 4.7883649575385894e-05, | |
| "loss": 1.1654, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 0.0208, | |
| "grad_norm": 0.8418052792549133, | |
| "learning_rate": 4.7877255978856516e-05, | |
| "loss": 1.1702, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 1.1825124025344849, | |
| "learning_rate": 4.787085316737116e-05, | |
| "loss": 1.1654, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.0212, | |
| "grad_norm": 1.301255702972412, | |
| "learning_rate": 4.78644411435089e-05, | |
| "loss": 1.2505, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 0.0214, | |
| "grad_norm": 1.2461885213851929, | |
| "learning_rate": 4.785801990985247e-05, | |
| "loss": 1.1907, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 0.0216, | |
| "grad_norm": 1.2957687377929688, | |
| "learning_rate": 4.7851589468988364e-05, | |
| "loss": 1.2244, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 0.0218, | |
| "grad_norm": 1.9566733837127686, | |
| "learning_rate": 4.7845149823506744e-05, | |
| "loss": 1.1688, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.9749345779418945, | |
| "learning_rate": 4.783870097600151e-05, | |
| "loss": 1.2178, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 1.076163649559021, | |
| "eval_runtime": 75.78, | |
| "eval_samples_per_second": 203.787, | |
| "eval_steps_per_second": 3.193, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.0222, | |
| "grad_norm": 1.1278064250946045, | |
| "learning_rate": 4.783224292907025e-05, | |
| "loss": 1.1899, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 1.023586392402649, | |
| "learning_rate": 4.7825775685314277e-05, | |
| "loss": 1.1967, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 0.0226, | |
| "grad_norm": 1.2925764322280884, | |
| "learning_rate": 4.781929924733858e-05, | |
| "loss": 1.2154, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 0.0228, | |
| "grad_norm": 0.8185212016105652, | |
| "learning_rate": 4.781281361775188e-05, | |
| "loss": 1.195, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 0.8742319941520691, | |
| "learning_rate": 4.7806318799166586e-05, | |
| "loss": 1.1746, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.0232, | |
| "grad_norm": 1.2598085403442383, | |
| "learning_rate": 4.77998147941988e-05, | |
| "loss": 1.1781, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 0.0234, | |
| "grad_norm": 1.2358424663543701, | |
| "learning_rate": 4.7793301605468344e-05, | |
| "loss": 1.2345, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 0.0236, | |
| "grad_norm": 1.2528828382492065, | |
| "learning_rate": 4.778677923559872e-05, | |
| "loss": 1.2109, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 0.0238, | |
| "grad_norm": 0.5741105675697327, | |
| "learning_rate": 4.778024768721716e-05, | |
| "loss": 1.2076, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.3200185298919678, | |
| "learning_rate": 4.7773706962954545e-05, | |
| "loss": 1.2124, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 1.0720120668411255, | |
| "eval_runtime": 76.4471, | |
| "eval_samples_per_second": 202.009, | |
| "eval_steps_per_second": 3.166, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.0242, | |
| "grad_norm": 1.4096635580062866, | |
| "learning_rate": 4.776715706544549e-05, | |
| "loss": 1.2283, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 0.0244, | |
| "grad_norm": 1.5862853527069092, | |
| "learning_rate": 4.7760597997328295e-05, | |
| "loss": 1.1927, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 0.0246, | |
| "grad_norm": 1.3406593799591064, | |
| "learning_rate": 4.7754029761244945e-05, | |
| "loss": 1.1709, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 0.0248, | |
| "grad_norm": 1.189676284790039, | |
| "learning_rate": 4.774745235984113e-05, | |
| "loss": 1.2176, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 1.4424960613250732, | |
| "learning_rate": 4.7740865795766224e-05, | |
| "loss": 1.2212, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.0252, | |
| "grad_norm": 0.7654275298118591, | |
| "learning_rate": 4.77342700716733e-05, | |
| "loss": 1.2196, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 0.0254, | |
| "grad_norm": 1.1389504671096802, | |
| "learning_rate": 4.772766519021911e-05, | |
| "loss": 1.1937, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 1.1204986572265625, | |
| "learning_rate": 4.772105115406409e-05, | |
| "loss": 1.1623, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 0.0258, | |
| "grad_norm": 1.2594044208526611, | |
| "learning_rate": 4.771442796587239e-05, | |
| "loss": 1.2127, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 1.3245586156845093, | |
| "learning_rate": 4.770779562831181e-05, | |
| "loss": 1.1919, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 1.0672369003295898, | |
| "eval_runtime": 76.1554, | |
| "eval_samples_per_second": 202.783, | |
| "eval_steps_per_second": 3.178, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.0262, | |
| "grad_norm": 0.813410222530365, | |
| "learning_rate": 4.770115414405388e-05, | |
| "loss": 1.224, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 0.0264, | |
| "grad_norm": 1.3278921842575073, | |
| "learning_rate": 4.769450351577377e-05, | |
| "loss": 1.2304, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 0.0266, | |
| "grad_norm": 1.1676868200302124, | |
| "learning_rate": 4.768784374615036e-05, | |
| "loss": 1.2144, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 0.0268, | |
| "grad_norm": 1.2270694971084595, | |
| "learning_rate": 4.7681174837866196e-05, | |
| "loss": 1.2584, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 1.5095762014389038, | |
| "learning_rate": 4.7674496793607525e-05, | |
| "loss": 1.1892, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.0272, | |
| "grad_norm": 1.0437262058258057, | |
| "learning_rate": 4.766780961606426e-05, | |
| "loss": 1.2003, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 0.0274, | |
| "grad_norm": 0.6719204187393188, | |
| "learning_rate": 4.766111330793e-05, | |
| "loss": 1.2145, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 0.0276, | |
| "grad_norm": 0.7166513204574585, | |
| "learning_rate": 4.765440787190199e-05, | |
| "loss": 1.2463, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 0.0278, | |
| "grad_norm": 0.9765319228172302, | |
| "learning_rate": 4.7647693310681204e-05, | |
| "loss": 1.2095, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 1.298134446144104, | |
| "learning_rate": 4.7640969626972265e-05, | |
| "loss": 1.2089, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 1.0727263689041138, | |
| "eval_runtime": 76.0016, | |
| "eval_samples_per_second": 203.193, | |
| "eval_steps_per_second": 3.184, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.0282, | |
| "grad_norm": 1.1968761682510376, | |
| "learning_rate": 4.763423682348347e-05, | |
| "loss": 1.1719, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 0.0284, | |
| "grad_norm": 1.1887174844741821, | |
| "learning_rate": 4.762749490292678e-05, | |
| "loss": 1.1961, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 0.0286, | |
| "grad_norm": 1.4029371738433838, | |
| "learning_rate": 4.762074386801786e-05, | |
| "loss": 1.1609, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 1.3785122632980347, | |
| "learning_rate": 4.761398372147601e-05, | |
| "loss": 1.1931, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 1.1329565048217773, | |
| "learning_rate": 4.760721446602422e-05, | |
| "loss": 1.2107, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.0292, | |
| "grad_norm": 1.2266113758087158, | |
| "learning_rate": 4.760043610438915e-05, | |
| "loss": 1.1708, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 0.0294, | |
| "grad_norm": 1.2526196241378784, | |
| "learning_rate": 4.759364863930112e-05, | |
| "loss": 1.2073, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 0.0296, | |
| "grad_norm": 1.3959336280822754, | |
| "learning_rate": 4.7586852073494113e-05, | |
| "loss": 1.1995, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 0.0298, | |
| "grad_norm": 1.2470852136611938, | |
| "learning_rate": 4.7580046409705806e-05, | |
| "loss": 1.2227, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.0915220975875854, | |
| "learning_rate": 4.7573231650677495e-05, | |
| "loss": 1.1955, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 1.0732471942901611, | |
| "eval_runtime": 75.8455, | |
| "eval_samples_per_second": 203.611, | |
| "eval_steps_per_second": 3.191, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.0302, | |
| "grad_norm": 1.4608689546585083, | |
| "learning_rate": 4.756640779915418e-05, | |
| "loss": 1.1588, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 0.0304, | |
| "grad_norm": 1.2811450958251953, | |
| "learning_rate": 4.755957485788449e-05, | |
| "loss": 1.1722, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 0.0306, | |
| "grad_norm": 1.3260635137557983, | |
| "learning_rate": 4.755273282962075e-05, | |
| "loss": 1.2238, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 0.0308, | |
| "grad_norm": 1.219567060470581, | |
| "learning_rate": 4.754588171711893e-05, | |
| "loss": 1.2718, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 1.368947982788086, | |
| "learning_rate": 4.753902152313865e-05, | |
| "loss": 1.1998, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.0312, | |
| "grad_norm": 1.3364487886428833, | |
| "learning_rate": 4.7532152250443194e-05, | |
| "loss": 1.2043, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 0.0314, | |
| "grad_norm": 1.348130702972412, | |
| "learning_rate": 4.7525273901799506e-05, | |
| "loss": 1.1834, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 0.0316, | |
| "grad_norm": 1.1862202882766724, | |
| "learning_rate": 4.751838647997818e-05, | |
| "loss": 1.2061, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 0.0318, | |
| "grad_norm": 0.7471460103988647, | |
| "learning_rate": 4.7511489987753476e-05, | |
| "loss": 1.1866, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.4090434312820435, | |
| "learning_rate": 4.75045844279033e-05, | |
| "loss": 1.1878, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 1.0745600461959839, | |
| "eval_runtime": 76.306, | |
| "eval_samples_per_second": 202.382, | |
| "eval_steps_per_second": 3.171, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.0322, | |
| "grad_norm": 1.591199278831482, | |
| "learning_rate": 4.7497669803209204e-05, | |
| "loss": 1.1824, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 0.0324, | |
| "grad_norm": 0.8325656652450562, | |
| "learning_rate": 4.749074611645641e-05, | |
| "loss": 1.1723, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 0.0326, | |
| "grad_norm": 0.8313425779342651, | |
| "learning_rate": 4.748381337043376e-05, | |
| "loss": 1.2033, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 0.0328, | |
| "grad_norm": 1.4721826314926147, | |
| "learning_rate": 4.7476871567933775e-05, | |
| "loss": 1.1988, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.9206506013870239, | |
| "learning_rate": 4.746992071175261e-05, | |
| "loss": 1.1844, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.0332, | |
| "grad_norm": 1.0820422172546387, | |
| "learning_rate": 4.746296080469007e-05, | |
| "loss": 1.1902, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 0.0334, | |
| "grad_norm": 0.9319769144058228, | |
| "learning_rate": 4.745599184954961e-05, | |
| "loss": 1.2031, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 0.0336, | |
| "grad_norm": 1.1914819478988647, | |
| "learning_rate": 4.744901384913831e-05, | |
| "loss": 1.166, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 0.0338, | |
| "grad_norm": 0.8440219163894653, | |
| "learning_rate": 4.7442026806266914e-05, | |
| "loss": 1.1493, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 1.001518726348877, | |
| "learning_rate": 4.7435030723749813e-05, | |
| "loss": 1.1835, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 1.0681182146072388, | |
| "eval_runtime": 76.1301, | |
| "eval_samples_per_second": 202.85, | |
| "eval_steps_per_second": 3.179, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.0342, | |
| "grad_norm": 1.347307562828064, | |
| "learning_rate": 4.742802560440501e-05, | |
| "loss": 1.2213, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 0.0344, | |
| "grad_norm": 1.1187894344329834, | |
| "learning_rate": 4.742101145105419e-05, | |
| "loss": 1.1949, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 0.0346, | |
| "grad_norm": 0.8066337704658508, | |
| "learning_rate": 4.741398826652262e-05, | |
| "loss": 1.2008, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 0.0348, | |
| "grad_norm": 1.0704104900360107, | |
| "learning_rate": 4.740695605363927e-05, | |
| "loss": 1.1804, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 1.104546070098877, | |
| "learning_rate": 4.7399914815236704e-05, | |
| "loss": 1.2232, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 1.1818023920059204, | |
| "learning_rate": 4.7392864554151126e-05, | |
| "loss": 1.2062, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 0.0354, | |
| "grad_norm": 1.3036936521530151, | |
| "learning_rate": 4.738580527322238e-05, | |
| "loss": 1.1905, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 0.0356, | |
| "grad_norm": 1.1169214248657227, | |
| "learning_rate": 4.737873697529395e-05, | |
| "loss": 1.1759, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 0.0358, | |
| "grad_norm": 0.8993995189666748, | |
| "learning_rate": 4.7371659663212934e-05, | |
| "loss": 1.1677, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 1.258747935295105, | |
| "learning_rate": 4.736457333983009e-05, | |
| "loss": 1.2166, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 1.0701075792312622, | |
| "eval_runtime": 75.9209, | |
| "eval_samples_per_second": 203.409, | |
| "eval_steps_per_second": 3.188, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.0362, | |
| "grad_norm": 1.269551396369934, | |
| "learning_rate": 4.735747800799978e-05, | |
| "loss": 1.2185, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 0.0364, | |
| "grad_norm": 1.3016119003295898, | |
| "learning_rate": 4.735037367057999e-05, | |
| "loss": 1.182, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 0.0366, | |
| "grad_norm": 1.1407994031906128, | |
| "learning_rate": 4.734326033043238e-05, | |
| "loss": 1.2102, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 0.0368, | |
| "grad_norm": 1.1673243045806885, | |
| "learning_rate": 4.7336137990422164e-05, | |
| "loss": 1.1902, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.9958565831184387, | |
| "learning_rate": 4.732900665341824e-05, | |
| "loss": 1.2112, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.0372, | |
| "grad_norm": 0.6769017577171326, | |
| "learning_rate": 4.732186632229311e-05, | |
| "loss": 1.1933, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 0.0374, | |
| "grad_norm": 0.6430754661560059, | |
| "learning_rate": 4.7314716999922896e-05, | |
| "loss": 1.1851, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 0.0376, | |
| "grad_norm": 1.103901982307434, | |
| "learning_rate": 4.7307558689187334e-05, | |
| "loss": 1.2234, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 0.0378, | |
| "grad_norm": 1.143268346786499, | |
| "learning_rate": 4.73003913929698e-05, | |
| "loss": 1.1609, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 1.2543673515319824, | |
| "learning_rate": 4.7293215114157284e-05, | |
| "loss": 1.1862, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 1.075058937072754, | |
| "eval_runtime": 77.0151, | |
| "eval_samples_per_second": 200.519, | |
| "eval_steps_per_second": 3.142, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.0382, | |
| "grad_norm": 1.0687370300292969, | |
| "learning_rate": 4.728602985564039e-05, | |
| "loss": 1.1878, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 1.230892539024353, | |
| "learning_rate": 4.727883562031333e-05, | |
| "loss": 1.1561, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 0.0386, | |
| "grad_norm": 1.0465742349624634, | |
| "learning_rate": 4.727163241107395e-05, | |
| "loss": 1.1677, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 0.0388, | |
| "grad_norm": 0.6553373336791992, | |
| "learning_rate": 4.726442023082369e-05, | |
| "loss": 1.2035, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.9347487688064575, | |
| "learning_rate": 4.725719908246763e-05, | |
| "loss": 1.2116, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.0392, | |
| "grad_norm": 1.0414602756500244, | |
| "learning_rate": 4.724996896891445e-05, | |
| "loss": 1.2237, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 0.0394, | |
| "grad_norm": 1.1857577562332153, | |
| "learning_rate": 4.724272989307642e-05, | |
| "loss": 1.1653, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 0.0396, | |
| "grad_norm": 1.3574703931808472, | |
| "learning_rate": 4.7235481857869446e-05, | |
| "loss": 1.2176, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 0.0398, | |
| "grad_norm": 1.3188464641571045, | |
| "learning_rate": 4.722822486621304e-05, | |
| "loss": 1.1872, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.1241661310195923, | |
| "learning_rate": 4.722095892103032e-05, | |
| "loss": 1.1926, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 1.0716365575790405, | |
| "eval_runtime": 76.5906, | |
| "eval_samples_per_second": 201.63, | |
| "eval_steps_per_second": 3.16, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.0402, | |
| "grad_norm": 0.9855309724807739, | |
| "learning_rate": 4.721368402524801e-05, | |
| "loss": 1.1427, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 0.0404, | |
| "grad_norm": 0.6458451151847839, | |
| "learning_rate": 4.720640018179642e-05, | |
| "loss": 1.2032, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 0.0406, | |
| "grad_norm": 1.1878600120544434, | |
| "learning_rate": 4.71991073936095e-05, | |
| "loss": 1.1879, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 0.0408, | |
| "grad_norm": 0.8349748253822327, | |
| "learning_rate": 4.719180566362477e-05, | |
| "loss": 1.1355, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 1.1937662363052368, | |
| "learning_rate": 4.7184494994783376e-05, | |
| "loss": 1.2018, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.0412, | |
| "grad_norm": 1.3011997938156128, | |
| "learning_rate": 4.7177175390030054e-05, | |
| "loss": 1.1697, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 0.0414, | |
| "grad_norm": 1.1909871101379395, | |
| "learning_rate": 4.7169846852313137e-05, | |
| "loss": 1.2126, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 1.5078299045562744, | |
| "learning_rate": 4.7162509384584555e-05, | |
| "loss": 1.1983, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 0.0418, | |
| "grad_norm": 1.3141160011291504, | |
| "learning_rate": 4.715516298979984e-05, | |
| "loss": 1.2118, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 1.3565207719802856, | |
| "learning_rate": 4.714780767091813e-05, | |
| "loss": 1.2054, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 1.0669591426849365, | |
| "eval_runtime": 75.959, | |
| "eval_samples_per_second": 203.307, | |
| "eval_steps_per_second": 3.186, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.0422, | |
| "grad_norm": 1.3890715837478638, | |
| "learning_rate": 4.714044343090214e-05, | |
| "loss": 1.1917, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 0.0424, | |
| "grad_norm": 0.9992968440055847, | |
| "learning_rate": 4.713307027271817e-05, | |
| "loss": 1.1869, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 0.0426, | |
| "grad_norm": 0.8716880679130554, | |
| "learning_rate": 4.712568819933615e-05, | |
| "loss": 1.1941, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 0.0428, | |
| "grad_norm": 1.243594765663147, | |
| "learning_rate": 4.711829721372957e-05, | |
| "loss": 1.1667, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.6567416191101074, | |
| "learning_rate": 4.7110897318875516e-05, | |
| "loss": 1.2105, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.0432, | |
| "grad_norm": 0.5886017084121704, | |
| "learning_rate": 4.710348851775467e-05, | |
| "loss": 1.1867, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 0.0434, | |
| "grad_norm": 0.6296970248222351, | |
| "learning_rate": 4.709607081335129e-05, | |
| "loss": 1.1702, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 0.0436, | |
| "grad_norm": 0.9896938800811768, | |
| "learning_rate": 4.7088644208653226e-05, | |
| "loss": 1.1628, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 0.0438, | |
| "grad_norm": 0.7199723720550537, | |
| "learning_rate": 4.708120870665192e-05, | |
| "loss": 1.1792, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 1.3148512840270996, | |
| "learning_rate": 4.707376431034238e-05, | |
| "loss": 1.185, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 1.0709099769592285, | |
| "eval_runtime": 75.8635, | |
| "eval_samples_per_second": 203.563, | |
| "eval_steps_per_second": 3.19, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.0442, | |
| "grad_norm": 0.6634069681167603, | |
| "learning_rate": 4.706631102272323e-05, | |
| "loss": 1.1633, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 0.0444, | |
| "grad_norm": 1.3700015544891357, | |
| "learning_rate": 4.705884884679663e-05, | |
| "loss": 1.1712, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 0.0446, | |
| "grad_norm": 1.1697111129760742, | |
| "learning_rate": 4.705137778556835e-05, | |
| "loss": 1.1902, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 1.4012552499771118, | |
| "learning_rate": 4.7043897842047735e-05, | |
| "loss": 1.216, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 1.2128801345825195, | |
| "learning_rate": 4.7036409019247706e-05, | |
| "loss": 1.2169, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.0452, | |
| "grad_norm": 1.435831904411316, | |
| "learning_rate": 4.7028911320184766e-05, | |
| "loss": 1.1839, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 0.0454, | |
| "grad_norm": 0.8126788139343262, | |
| "learning_rate": 4.702140474787898e-05, | |
| "loss": 1.1652, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 0.0456, | |
| "grad_norm": 1.1946730613708496, | |
| "learning_rate": 4.7013889305353985e-05, | |
| "loss": 1.2277, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 0.0458, | |
| "grad_norm": 0.6007882952690125, | |
| "learning_rate": 4.700636499563702e-05, | |
| "loss": 1.1586, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.6656979322433472, | |
| "learning_rate": 4.699883182175886e-05, | |
| "loss": 1.1902, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 1.072899580001831, | |
| "eval_runtime": 77.2342, | |
| "eval_samples_per_second": 199.95, | |
| "eval_steps_per_second": 3.133, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.0462, | |
| "grad_norm": 1.5463351011276245, | |
| "learning_rate": 4.6991289786753876e-05, | |
| "loss": 1.1988, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 0.0464, | |
| "grad_norm": 1.202536940574646, | |
| "learning_rate": 4.698373889366e-05, | |
| "loss": 1.1983, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 0.0466, | |
| "grad_norm": 0.7186087369918823, | |
| "learning_rate": 4.6976179145518724e-05, | |
| "loss": 1.15, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 0.0468, | |
| "grad_norm": 1.3059759140014648, | |
| "learning_rate": 4.6968610545375116e-05, | |
| "loss": 1.1896, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.8425590991973877, | |
| "learning_rate": 4.696103309627781e-05, | |
| "loss": 1.1747, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.0472, | |
| "grad_norm": 1.1745330095291138, | |
| "learning_rate": 4.695344680127899e-05, | |
| "loss": 1.1591, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 0.0474, | |
| "grad_norm": 0.6429449915885925, | |
| "learning_rate": 4.694585166343443e-05, | |
| "loss": 1.1893, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 0.0476, | |
| "grad_norm": 1.5323892831802368, | |
| "learning_rate": 4.693824768580344e-05, | |
| "loss": 1.2037, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 0.0478, | |
| "grad_norm": 1.2719688415527344, | |
| "learning_rate": 4.693063487144891e-05, | |
| "loss": 1.191, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 1.1735507249832153, | |
| "learning_rate": 4.6923013223437276e-05, | |
| "loss": 1.1904, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 1.0721956491470337, | |
| "eval_runtime": 76.3531, | |
| "eval_samples_per_second": 202.258, | |
| "eval_steps_per_second": 3.169, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.0482, | |
| "grad_norm": 1.1949045658111572, | |
| "learning_rate": 4.6915382744838536e-05, | |
| "loss": 1.1507, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 0.0484, | |
| "grad_norm": 1.074385404586792, | |
| "learning_rate": 4.690774343872625e-05, | |
| "loss": 1.1504, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 0.0486, | |
| "grad_norm": 1.0720473527908325, | |
| "learning_rate": 4.690009530817753e-05, | |
| "loss": 1.1758, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 0.0488, | |
| "grad_norm": 1.0596733093261719, | |
| "learning_rate": 4.6892438356273024e-05, | |
| "loss": 1.1778, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 1.2753647565841675, | |
| "learning_rate": 4.688477258609698e-05, | |
| "loss": 1.1827, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.0492, | |
| "grad_norm": 1.2803727388381958, | |
| "learning_rate": 4.687709800073715e-05, | |
| "loss": 1.164, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 0.0494, | |
| "grad_norm": 1.4797301292419434, | |
| "learning_rate": 4.6869414603284865e-05, | |
| "loss": 1.1748, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 0.0496, | |
| "grad_norm": 1.1455540657043457, | |
| "learning_rate": 4.6861722396834996e-05, | |
| "loss": 1.1918, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 0.0498, | |
| "grad_norm": 1.1636658906936646, | |
| "learning_rate": 4.6854021384485954e-05, | |
| "loss": 1.208, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.267817735671997, | |
| "learning_rate": 4.684631156933971e-05, | |
| "loss": 1.1679, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 1.0709259510040283, | |
| "eval_runtime": 76.3729, | |
| "eval_samples_per_second": 202.205, | |
| "eval_steps_per_second": 3.169, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.0502, | |
| "grad_norm": 1.5029271841049194, | |
| "learning_rate": 4.683859295450178e-05, | |
| "loss": 1.1459, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 0.0504, | |
| "grad_norm": 0.7328454256057739, | |
| "learning_rate": 4.683086554308123e-05, | |
| "loss": 1.1861, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 0.0506, | |
| "grad_norm": 1.114625334739685, | |
| "learning_rate": 4.682312933819063e-05, | |
| "loss": 1.1609, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 0.0508, | |
| "grad_norm": 1.4052484035491943, | |
| "learning_rate": 4.681538434294615e-05, | |
| "loss": 1.1534, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.7364799976348877, | |
| "learning_rate": 4.6807630560467475e-05, | |
| "loss": 1.1973, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 0.701787531375885, | |
| "learning_rate": 4.679986799387781e-05, | |
| "loss": 1.1743, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 0.0514, | |
| "grad_norm": 1.331763744354248, | |
| "learning_rate": 4.679209664630393e-05, | |
| "loss": 1.1516, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 0.0516, | |
| "grad_norm": 0.9733197689056396, | |
| "learning_rate": 4.6784316520876124e-05, | |
| "loss": 1.1646, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 0.0518, | |
| "grad_norm": 0.7415294051170349, | |
| "learning_rate": 4.677652762072823e-05, | |
| "loss": 1.2005, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 1.1027395725250244, | |
| "learning_rate": 4.6768729948997606e-05, | |
| "loss": 1.1601, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 1.0681675672531128, | |
| "eval_runtime": 76.2441, | |
| "eval_samples_per_second": 202.547, | |
| "eval_steps_per_second": 3.174, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.0522, | |
| "grad_norm": 0.7156331539154053, | |
| "learning_rate": 4.676092350882517e-05, | |
| "loss": 1.1854, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 0.0524, | |
| "grad_norm": 1.3423713445663452, | |
| "learning_rate": 4.675310830335534e-05, | |
| "loss": 1.2135, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 0.0526, | |
| "grad_norm": 1.1925442218780518, | |
| "learning_rate": 4.6745284335736095e-05, | |
| "loss": 1.1414, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 0.0528, | |
| "grad_norm": 1.1717417240142822, | |
| "learning_rate": 4.673745160911891e-05, | |
| "loss": 1.184, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 1.0722715854644775, | |
| "learning_rate": 4.672961012665883e-05, | |
| "loss": 1.1685, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.0532, | |
| "grad_norm": 1.293058156967163, | |
| "learning_rate": 4.6721759891514386e-05, | |
| "loss": 1.1639, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 0.0534, | |
| "grad_norm": 1.1121761798858643, | |
| "learning_rate": 4.671390090684765e-05, | |
| "loss": 1.1693, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 0.0536, | |
| "grad_norm": 1.1979039907455444, | |
| "learning_rate": 4.6706033175824226e-05, | |
| "loss": 1.2123, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 0.0538, | |
| "grad_norm": 1.3003602027893066, | |
| "learning_rate": 4.669815670161324e-05, | |
| "loss": 1.1529, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.627068817615509, | |
| "learning_rate": 4.669027148738732e-05, | |
| "loss": 1.1901, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 1.0730499029159546, | |
| "eval_runtime": 76.271, | |
| "eval_samples_per_second": 202.475, | |
| "eval_steps_per_second": 3.173, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.0542, | |
| "grad_norm": 1.0153006315231323, | |
| "learning_rate": 4.6682377536322644e-05, | |
| "loss": 1.1295, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 1.3619033098220825, | |
| "learning_rate": 4.667447485159889e-05, | |
| "loss": 1.1759, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 0.0546, | |
| "grad_norm": 0.8665468692779541, | |
| "learning_rate": 4.666656343639926e-05, | |
| "loss": 1.1602, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 0.0548, | |
| "grad_norm": 0.7338219285011292, | |
| "learning_rate": 4.665864329391046e-05, | |
| "loss": 1.1856, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.7363407611846924, | |
| "learning_rate": 4.665071442732274e-05, | |
| "loss": 1.1629, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.0552, | |
| "grad_norm": 0.9836055636405945, | |
| "learning_rate": 4.664277683982984e-05, | |
| "loss": 1.1755, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 0.0554, | |
| "grad_norm": 1.0638995170593262, | |
| "learning_rate": 4.663483053462901e-05, | |
| "loss": 1.186, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 0.0556, | |
| "grad_norm": 0.9050219058990479, | |
| "learning_rate": 4.662687551492103e-05, | |
| "loss": 1.2357, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 0.0558, | |
| "grad_norm": 0.917178213596344, | |
| "learning_rate": 4.661891178391018e-05, | |
| "loss": 1.1573, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 1.2023630142211914, | |
| "learning_rate": 4.661093934480425e-05, | |
| "loss": 1.1795, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 1.0689297914505005, | |
| "eval_runtime": 77.0471, | |
| "eval_samples_per_second": 200.436, | |
| "eval_steps_per_second": 3.141, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.0562, | |
| "grad_norm": 1.2633955478668213, | |
| "learning_rate": 4.660295820081453e-05, | |
| "loss": 1.1501, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 0.0564, | |
| "grad_norm": 0.5867215991020203, | |
| "learning_rate": 4.6594968355155835e-05, | |
| "loss": 1.2096, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 0.0566, | |
| "grad_norm": 1.3425019979476929, | |
| "learning_rate": 4.658696981104646e-05, | |
| "loss": 1.2016, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 0.0568, | |
| "grad_norm": 0.8101886510848999, | |
| "learning_rate": 4.657896257170825e-05, | |
| "loss": 1.1512, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 1.43784761428833, | |
| "learning_rate": 4.6570946640366474e-05, | |
| "loss": 1.1536, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.0572, | |
| "grad_norm": 0.766494870185852, | |
| "learning_rate": 4.6562922020249984e-05, | |
| "loss": 1.1521, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 0.0574, | |
| "grad_norm": 1.5485390424728394, | |
| "learning_rate": 4.6554888714591076e-05, | |
| "loss": 1.176, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 0.8266467452049255, | |
| "learning_rate": 4.654684672662557e-05, | |
| "loss": 1.1514, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 0.0578, | |
| "grad_norm": 1.2086583375930786, | |
| "learning_rate": 4.6538796059592784e-05, | |
| "loss": 1.177, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 1.4609780311584473, | |
| "learning_rate": 4.6530736716735526e-05, | |
| "loss": 1.1447, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 1.0664150714874268, | |
| "eval_runtime": 76.3509, | |
| "eval_samples_per_second": 202.264, | |
| "eval_steps_per_second": 3.17, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.0582, | |
| "grad_norm": 1.0640435218811035, | |
| "learning_rate": 4.652266870130008e-05, | |
| "loss": 1.1392, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 0.0584, | |
| "grad_norm": 1.3286436796188354, | |
| "learning_rate": 4.651459201653626e-05, | |
| "loss": 1.222, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 0.0586, | |
| "grad_norm": 0.7577000260353088, | |
| "learning_rate": 4.650650666569736e-05, | |
| "loss": 1.1842, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 0.0588, | |
| "grad_norm": 1.0623698234558105, | |
| "learning_rate": 4.6498412652040137e-05, | |
| "loss": 1.2071, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.9597827792167664, | |
| "learning_rate": 4.6490309978824866e-05, | |
| "loss": 1.1781, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.0592, | |
| "grad_norm": 1.126639485359192, | |
| "learning_rate": 4.6482198649315306e-05, | |
| "loss": 1.1897, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 0.0594, | |
| "grad_norm": 1.1724388599395752, | |
| "learning_rate": 4.64740786667787e-05, | |
| "loss": 1.1567, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 0.0596, | |
| "grad_norm": 1.14126718044281, | |
| "learning_rate": 4.6465950034485776e-05, | |
| "loss": 1.1819, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 0.0598, | |
| "grad_norm": 0.8016234040260315, | |
| "learning_rate": 4.645781275571075e-05, | |
| "loss": 1.1906, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.3095015287399292, | |
| "learning_rate": 4.644966683373131e-05, | |
| "loss": 1.1976, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 1.0730445384979248, | |
| "eval_runtime": 76.1401, | |
| "eval_samples_per_second": 202.823, | |
| "eval_steps_per_second": 3.178, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.0602, | |
| "grad_norm": 0.5794508457183838, | |
| "learning_rate": 4.6441512271828626e-05, | |
| "loss": 1.1478, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 0.0604, | |
| "grad_norm": 0.9965047240257263, | |
| "learning_rate": 4.6433349073287366e-05, | |
| "loss": 1.201, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 0.0606, | |
| "grad_norm": 1.280166506767273, | |
| "learning_rate": 4.642517724139567e-05, | |
| "loss": 1.1542, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 0.7828945517539978, | |
| "learning_rate": 4.641699677944514e-05, | |
| "loss": 1.186, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 1.096155047416687, | |
| "learning_rate": 4.640880769073087e-05, | |
| "loss": 1.1969, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.0612, | |
| "grad_norm": 0.7447170615196228, | |
| "learning_rate": 4.6400609978551416e-05, | |
| "loss": 1.1482, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 0.0614, | |
| "grad_norm": 0.8162779808044434, | |
| "learning_rate": 4.639240364620882e-05, | |
| "loss": 1.2072, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 0.0616, | |
| "grad_norm": 1.2612018585205078, | |
| "learning_rate": 4.638418869700861e-05, | |
| "loss": 1.1402, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 0.0618, | |
| "grad_norm": 0.8543398380279541, | |
| "learning_rate": 4.637596513425974e-05, | |
| "loss": 1.1718, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 1.2375905513763428, | |
| "learning_rate": 4.636773296127467e-05, | |
| "loss": 1.1587, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "eval_loss": 1.0713858604431152, | |
| "eval_runtime": 76.3385, | |
| "eval_samples_per_second": 202.296, | |
| "eval_steps_per_second": 3.17, | |
| "step": 91000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 500000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.970894657486848e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
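
The object above matches the layout of the `trainer_state.json` file that the Hugging Face `transformers` `Trainer` saves inside each checkpoint directory: `log_history` interleaves training records (carrying `loss`, `grad_norm`, and `learning_rate`) with evaluation records (carrying `eval_loss` and throughput figures), all keyed by the global `step`, while the trailing fields record the run configuration (`logging_steps: 100`, `eval_steps` and `save_steps` of 1000, `max_steps: 500000`). Below is a minimal sketch for loading and summarizing such a file with only the standard library; the filename and the particular summary printed are illustrative assumptions, not part of the checkpoint format itself.

```python
# Minimal sketch (assumption: the state above is saved as "trainer_state.json").
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history entries are either training records (they carry "loss") or
# evaluation records (they carry "eval_loss"); both carry the global "step".
train = [(r["step"], r["loss"]) for r in state["log_history"] if "loss" in r]
evals = [(r["step"], r["eval_loss"]) for r in state["log_history"] if "eval_loss" in r]

print(f"global_step={state['global_step']}, epoch={state['epoch']}")
print(f"{len(train)} training records, {len(evals)} eval records")
if evals:
    best_step, best_loss = min(evals, key=lambda t: t[1])
    print(f"lowest eval_loss {best_loss:.4f} at step {best_step}")
```

On the records visible in this portion of the log, that minimum would be the eval_loss of 1.0664 reported at step 89,000.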