| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.034, |
| "eval_steps": 1000, |
| "global_step": 17000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2e-06, |
| "grad_norm": 29.506126403808594, |
| "learning_rate": 0.0, |
| "loss": 1.5091, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 7.35781717300415, |
| "learning_rate": 9.9e-07, |
| "loss": 1.6562, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 4.9180989265441895, |
| "learning_rate": 1.99e-06, |
| "loss": 1.6176, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 1.8868086338043213, |
| "learning_rate": 2.99e-06, |
| "loss": 1.548, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 7.365355491638184, |
| "learning_rate": 3.99e-06, |
| "loss": 1.4958, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 8.965476989746094, |
| "learning_rate": 4.9900000000000005e-06, |
| "loss": 1.4918, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 2.2186834812164307, |
| "learning_rate": 5.99e-06, |
| "loss": 1.4807, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 1.970430850982666, |
| "learning_rate": 6.990000000000001e-06, |
| "loss": 1.4312, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 1.5914119482040405, |
| "learning_rate": 7.99e-06, |
| "loss": 1.3848, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 1.7615679502487183, |
| "learning_rate": 8.99e-06, |
| "loss": 1.4126, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 1.5981565713882446, |
| "learning_rate": 9.990000000000001e-06, |
| "loss": 1.3768, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.002, |
| "eval_loss": 1.1488478183746338, |
| "eval_runtime": 84.3931, |
| "eval_samples_per_second": 182.989, |
| "eval_steps_per_second": 2.868, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 1.9463247060775757, |
| "learning_rate": 1.099e-05, |
| "loss": 1.4649, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 1.997353434562683, |
| "learning_rate": 1.199e-05, |
| "loss": 1.422, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0026, |
| "grad_norm": 2.028587818145752, |
| "learning_rate": 1.299e-05, |
| "loss": 1.4101, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 1.8055784702301025, |
| "learning_rate": 1.399e-05, |
| "loss": 1.379, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.630389451980591, |
| "learning_rate": 1.499e-05, |
| "loss": 1.3915, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 1.4471231698989868, |
| "learning_rate": 1.599e-05, |
| "loss": 1.3651, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0034, |
| "grad_norm": 1.4115934371948242, |
| "learning_rate": 1.699e-05, |
| "loss": 1.3327, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 1.1099858283996582, |
| "learning_rate": 1.7990000000000002e-05, |
| "loss": 1.304, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0038, |
| "grad_norm": 1.5767651796340942, |
| "learning_rate": 1.8990000000000003e-05, |
| "loss": 1.3375, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 1.3484268188476562, |
| "learning_rate": 1.999e-05, |
| "loss": 1.3746, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.004, |
| "eval_loss": 1.1486531496047974, |
| "eval_runtime": 76.1223, |
| "eval_samples_per_second": 202.871, |
| "eval_steps_per_second": 3.179, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.0042, |
| "grad_norm": 1.6412079334259033, |
| "learning_rate": 2.099e-05, |
| "loss": 1.3931, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 1.17317533493042, |
| "learning_rate": 2.199e-05, |
| "loss": 1.3512, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.0046, |
| "grad_norm": 0.8342074751853943, |
| "learning_rate": 2.2990000000000002e-05, |
| "loss": 1.3805, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 1.5843234062194824, |
| "learning_rate": 2.3990000000000002e-05, |
| "loss": 1.377, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1.915511131286621, |
| "learning_rate": 2.4990000000000003e-05, |
| "loss": 1.3659, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 1.6507076025009155, |
| "learning_rate": 2.5990000000000004e-05, |
| "loss": 1.2875, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.0054, |
| "grad_norm": 1.5680265426635742, |
| "learning_rate": 2.6989999999999997e-05, |
| "loss": 1.3402, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 0.8005309700965881, |
| "learning_rate": 2.7989999999999998e-05, |
| "loss": 1.3565, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.0058, |
| "grad_norm": 1.664014220237732, |
| "learning_rate": 2.8990000000000002e-05, |
| "loss": 1.3118, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.1597651243209839, |
| "learning_rate": 2.9990000000000003e-05, |
| "loss": 1.3207, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.006, |
| "eval_loss": 1.1344993114471436, |
| "eval_runtime": 76.5771, |
| "eval_samples_per_second": 201.666, |
| "eval_steps_per_second": 3.16, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.0062, |
| "grad_norm": 1.6559661626815796, |
| "learning_rate": 3.099e-05, |
| "loss": 1.3103, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 1.390712857246399, |
| "learning_rate": 3.1990000000000004e-05, |
| "loss": 1.3855, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.0066, |
| "grad_norm": 1.9980418682098389, |
| "learning_rate": 3.299e-05, |
| "loss": 1.3109, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 1.2899682521820068, |
| "learning_rate": 3.399e-05, |
| "loss": 1.3219, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.44901704788208, |
| "learning_rate": 3.499e-05, |
| "loss": 1.3089, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 1.3377976417541504, |
| "learning_rate": 3.599e-05, |
| "loss": 1.2995, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.0074, |
| "grad_norm": 1.5043129920959473, |
| "learning_rate": 3.699e-05, |
| "loss": 1.3421, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 1.4387165307998657, |
| "learning_rate": 3.799e-05, |
| "loss": 1.3337, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.0078, |
| "grad_norm": 1.1607294082641602, |
| "learning_rate": 3.8990000000000004e-05, |
| "loss": 1.2852, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.0189259052276611, |
| "learning_rate": 3.999e-05, |
| "loss": 1.3277, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.008, |
| "eval_loss": 1.1298929452896118, |
| "eval_runtime": 76.4952, |
| "eval_samples_per_second": 201.882, |
| "eval_steps_per_second": 3.164, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.0082, |
| "grad_norm": 1.6229581832885742, |
| "learning_rate": 4.099e-05, |
| "loss": 1.2878, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 1.693702220916748, |
| "learning_rate": 4.199e-05, |
| "loss": 1.313, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.0086, |
| "grad_norm": 1.169730544090271, |
| "learning_rate": 4.299e-05, |
| "loss": 1.2915, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 1.3561712503433228, |
| "learning_rate": 4.3990000000000004e-05, |
| "loss": 1.3337, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1.4713114500045776, |
| "learning_rate": 4.499e-05, |
| "loss": 1.309, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 1.0679044723510742, |
| "learning_rate": 4.599e-05, |
| "loss": 1.3464, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.0094, |
| "grad_norm": 1.4595869779586792, |
| "learning_rate": 4.699e-05, |
| "loss": 1.3385, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 1.6443949937820435, |
| "learning_rate": 4.799e-05, |
| "loss": 1.3287, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.0098, |
| "grad_norm": 1.3524634838104248, |
| "learning_rate": 4.8990000000000004e-05, |
| "loss": 1.3224, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.552986979484558, |
| "learning_rate": 4.999e-05, |
| "loss": 1.3256, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.01, |
| "eval_loss": 1.1314986944198608, |
| "eval_runtime": 76.3433, |
| "eval_samples_per_second": 202.284, |
| "eval_steps_per_second": 3.17, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0102, |
| "grad_norm": 1.1126846075057983, |
| "learning_rate": 4.9999995065197964e-05, |
| "loss": 1.3184, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 0.8533400893211365, |
| "learning_rate": 4.999998006090441e-05, |
| "loss": 1.3145, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.0106, |
| "grad_norm": 1.6032077074050903, |
| "learning_rate": 4.9999954986621866e-05, |
| "loss": 1.2894, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 1.2594430446624756, |
| "learning_rate": 4.999991984236044e-05, |
| "loss": 1.2515, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 1.2169750928878784, |
| "learning_rate": 4.99998746281343e-05, |
| "loss": 1.2603, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 1.2038013935089111, |
| "learning_rate": 4.999981934396165e-05, |
| "loss": 1.3063, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.0114, |
| "grad_norm": 1.1477010250091553, |
| "learning_rate": 4.999975398986476e-05, |
| "loss": 1.3057, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 0.6725754141807556, |
| "learning_rate": 4.9999678565869944e-05, |
| "loss": 1.3211, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.0118, |
| "grad_norm": 1.5470402240753174, |
| "learning_rate": 4.99995930720076e-05, |
| "loss": 1.2794, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 1.8079277276992798, |
| "learning_rate": 4.999949750831215e-05, |
| "loss": 1.2736, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.012, |
| "eval_loss": 1.1335862874984741, |
| "eval_runtime": 76.3508, |
| "eval_samples_per_second": 202.264, |
| "eval_steps_per_second": 3.17, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0122, |
| "grad_norm": 1.4117431640625, |
| "learning_rate": 4.99993918748221e-05, |
| "loss": 1.3142, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.0124, |
| "grad_norm": 1.2657192945480347, |
| "learning_rate": 4.999927617157998e-05, |
| "loss": 1.3216, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.0126, |
| "grad_norm": 1.0358809232711792, |
| "learning_rate": 4.9999150398632425e-05, |
| "loss": 1.329, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 1.6824450492858887, |
| "learning_rate": 4.999901455603007e-05, |
| "loss": 1.2911, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 1.5632168054580688, |
| "learning_rate": 4.9998868643827635e-05, |
| "loss": 1.3004, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.0132, |
| "grad_norm": 1.254310131072998, |
| "learning_rate": 4.99987126620839e-05, |
| "loss": 1.2981, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.0134, |
| "grad_norm": 1.4540060758590698, |
| "learning_rate": 4.999854661086171e-05, |
| "loss": 1.3184, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.0136, |
| "grad_norm": 1.3684179782867432, |
| "learning_rate": 4.999837049022792e-05, |
| "loss": 1.2914, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.0138, |
| "grad_norm": 1.474075436592102, |
| "learning_rate": 4.999818430025349e-05, |
| "loss": 1.2702, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 1.3687875270843506, |
| "learning_rate": 4.999798804101341e-05, |
| "loss": 1.2388, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.014, |
| "eval_loss": 1.1258224248886108, |
| "eval_runtime": 76.3516, |
| "eval_samples_per_second": 202.262, |
| "eval_steps_per_second": 3.17, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.0142, |
| "grad_norm": 0.6668384075164795, |
| "learning_rate": 4.999778171258675e-05, |
| "loss": 1.2768, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.0144, |
| "grad_norm": 1.1303478479385376, |
| "learning_rate": 4.9997565315056596e-05, |
| "loss": 1.2639, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.0146, |
| "grad_norm": 1.516221046447754, |
| "learning_rate": 4.999733884851012e-05, |
| "loss": 1.2805, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.0148, |
| "grad_norm": 1.3124428987503052, |
| "learning_rate": 4.9997102313038544e-05, |
| "loss": 1.2811, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 1.390687346458435, |
| "learning_rate": 4.999685570873715e-05, |
| "loss": 1.2481, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.0152, |
| "grad_norm": 0.8783305883407593, |
| "learning_rate": 4.999659903570526e-05, |
| "loss": 1.2986, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.0154, |
| "grad_norm": 1.0741727352142334, |
| "learning_rate": 4.999633229404628e-05, |
| "loss": 1.2784, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.0156, |
| "grad_norm": 1.022088885307312, |
| "learning_rate": 4.999605548386763e-05, |
| "loss": 1.2869, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.0158, |
| "grad_norm": 1.0997594594955444, |
| "learning_rate": 4.9995768605280826e-05, |
| "loss": 1.2736, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 1.191188931465149, |
| "learning_rate": 4.9995471658401414e-05, |
| "loss": 1.256, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.016, |
| "eval_loss": 1.1234357357025146, |
| "eval_runtime": 76.115, |
| "eval_samples_per_second": 202.89, |
| "eval_steps_per_second": 3.179, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.0162, |
| "grad_norm": 0.7304887175559998, |
| "learning_rate": 4.9995164643349015e-05, |
| "loss": 1.2717, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.0164, |
| "grad_norm": 1.2335166931152344, |
| "learning_rate": 4.9994847560247276e-05, |
| "loss": 1.2657, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.0166, |
| "grad_norm": 1.424973487854004, |
| "learning_rate": 4.999452040922393e-05, |
| "loss": 1.3235, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.0168, |
| "grad_norm": 1.1544169187545776, |
| "learning_rate": 4.999418319041076e-05, |
| "loss": 1.2455, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 1.1393338441848755, |
| "learning_rate": 4.9993835903943585e-05, |
| "loss": 1.233, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.0172, |
| "grad_norm": 1.1183439493179321, |
| "learning_rate": 4.99934785499623e-05, |
| "loss": 1.2282, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.0174, |
| "grad_norm": 1.275148868560791, |
| "learning_rate": 4.999311112861084e-05, |
| "loss": 1.2665, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.0176, |
| "grad_norm": 1.4136372804641724, |
| "learning_rate": 4.99927336400372e-05, |
| "loss": 1.2617, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.0178, |
| "grad_norm": 1.392327904701233, |
| "learning_rate": 4.999234608439345e-05, |
| "loss": 1.292, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 1.367475152015686, |
| "learning_rate": 4.9991948461835685e-05, |
| "loss": 1.2153, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.018, |
| "eval_loss": 1.1127148866653442, |
| "eval_runtime": 76.2524, |
| "eval_samples_per_second": 202.525, |
| "eval_steps_per_second": 3.174, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.0182, |
| "grad_norm": 0.8793131709098816, |
| "learning_rate": 4.999154077252407e-05, |
| "loss": 1.2734, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.0184, |
| "grad_norm": 0.6496739387512207, |
| "learning_rate": 4.999112301662281e-05, |
| "loss": 1.2498, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.0186, |
| "grad_norm": 1.1462939977645874, |
| "learning_rate": 4.99906951943002e-05, |
| "loss": 1.2549, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.0188, |
| "grad_norm": 1.520691156387329, |
| "learning_rate": 4.999025730572854e-05, |
| "loss": 1.2437, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 1.3555136919021606, |
| "learning_rate": 4.998980935108424e-05, |
| "loss": 1.2326, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 1.467217206954956, |
| "learning_rate": 4.9989351330547715e-05, |
| "loss": 1.2768, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.0194, |
| "grad_norm": 1.3842765092849731, |
| "learning_rate": 4.998888324430346e-05, |
| "loss": 1.2675, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.0196, |
| "grad_norm": 1.344078540802002, |
| "learning_rate": 4.998840509254003e-05, |
| "loss": 1.2619, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.0198, |
| "grad_norm": 0.7567517757415771, |
| "learning_rate": 4.998791687545001e-05, |
| "loss": 1.2794, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.9987697601318359, |
| "learning_rate": 4.998741859323006e-05, |
| "loss": 1.2778, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.02, |
| "eval_loss": 1.1275579929351807, |
| "eval_runtime": 76.2888, |
| "eval_samples_per_second": 202.428, |
| "eval_steps_per_second": 3.172, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.0202, |
| "grad_norm": 1.5212323665618896, |
| "learning_rate": 4.9986910246080894e-05, |
| "loss": 1.2884, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.0204, |
| "grad_norm": 1.5730245113372803, |
| "learning_rate": 4.998639183420727e-05, |
| "loss": 1.282, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.0206, |
| "grad_norm": 0.8342368602752686, |
| "learning_rate": 4.9985863357818e-05, |
| "loss": 1.2408, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.0208, |
| "grad_norm": 1.3672316074371338, |
| "learning_rate": 4.998532481712596e-05, |
| "loss": 1.2205, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 1.1164605617523193, |
| "learning_rate": 4.998477621234806e-05, |
| "loss": 1.2817, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.0212, |
| "grad_norm": 1.2867449522018433, |
| "learning_rate": 4.99842175437053e-05, |
| "loss": 1.2598, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.0214, |
| "grad_norm": 1.6646244525909424, |
| "learning_rate": 4.99836488114227e-05, |
| "loss": 1.2163, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.0216, |
| "grad_norm": 1.3233399391174316, |
| "learning_rate": 4.998307001572935e-05, |
| "loss": 1.2744, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.0218, |
| "grad_norm": 1.1658077239990234, |
| "learning_rate": 4.9982481156858385e-05, |
| "loss": 1.274, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 1.4505467414855957, |
| "learning_rate": 4.9981882235046995e-05, |
| "loss": 1.2645, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.022, |
| "eval_loss": 1.1138958930969238, |
| "eval_runtime": 76.7643, |
| "eval_samples_per_second": 201.174, |
| "eval_steps_per_second": 3.153, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.0222, |
| "grad_norm": 0.8515588641166687, |
| "learning_rate": 4.998127325053642e-05, |
| "loss": 1.2359, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 1.4022259712219238, |
| "learning_rate": 4.9980654203571983e-05, |
| "loss": 1.2515, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.0226, |
| "grad_norm": 1.5902676582336426, |
| "learning_rate": 4.998002509440301e-05, |
| "loss": 1.2305, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.0228, |
| "grad_norm": 0.763087809085846, |
| "learning_rate": 4.997938592328292e-05, |
| "loss": 1.2312, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 1.4949332475662231, |
| "learning_rate": 4.997873669046916e-05, |
| "loss": 1.2768, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.0232, |
| "grad_norm": 1.0390666723251343, |
| "learning_rate": 4.9978077396223255e-05, |
| "loss": 1.2355, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.0234, |
| "grad_norm": 0.6799549460411072, |
| "learning_rate": 4.997740804081076e-05, |
| "loss": 1.264, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.0236, |
| "grad_norm": 1.4702496528625488, |
| "learning_rate": 4.99767286245013e-05, |
| "loss": 1.3092, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.0238, |
| "grad_norm": 1.3574661016464233, |
| "learning_rate": 4.997603914756853e-05, |
| "loss": 1.2654, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 1.1170625686645508, |
| "learning_rate": 4.9975339610290175e-05, |
| "loss": 1.2343, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.024, |
| "eval_loss": 1.1109821796417236, |
| "eval_runtime": 76.4587, |
| "eval_samples_per_second": 201.978, |
| "eval_steps_per_second": 3.165, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.0242, |
| "grad_norm": 1.2707583904266357, |
| "learning_rate": 4.997463001294802e-05, |
| "loss": 1.2525, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.0244, |
| "grad_norm": 1.2613739967346191, |
| "learning_rate": 4.997391035582788e-05, |
| "loss": 1.2698, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.0246, |
| "grad_norm": 1.1995183229446411, |
| "learning_rate": 4.997318063921963e-05, |
| "loss": 1.237, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.0248, |
| "grad_norm": 0.729535698890686, |
| "learning_rate": 4.997244086341721e-05, |
| "loss": 1.2248, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 1.3250787258148193, |
| "learning_rate": 4.9971691028718594e-05, |
| "loss": 1.2617, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.0252, |
| "grad_norm": 1.421278476715088, |
| "learning_rate": 4.997093113542582e-05, |
| "loss": 1.2321, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.0254, |
| "grad_norm": 1.5168310403823853, |
| "learning_rate": 4.997016118384497e-05, |
| "loss": 1.2268, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.0256, |
| "grad_norm": 1.045483946800232, |
| "learning_rate": 4.996938117428618e-05, |
| "loss": 1.2714, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.0258, |
| "grad_norm": 0.8379656076431274, |
| "learning_rate": 4.9968591107063647e-05, |
| "loss": 1.2792, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 1.620133638381958, |
| "learning_rate": 4.996779098249559e-05, |
| "loss": 1.2456, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.026, |
| "eval_loss": 1.1081608533859253, |
| "eval_runtime": 76.4734, |
| "eval_samples_per_second": 201.939, |
| "eval_steps_per_second": 3.164, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.0262, |
| "grad_norm": 1.2181329727172852, |
| "learning_rate": 4.9966980800904315e-05, |
| "loss": 1.2187, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.0264, |
| "grad_norm": 1.4935636520385742, |
| "learning_rate": 4.996616056261616e-05, |
| "loss": 1.2405, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.0266, |
| "grad_norm": 1.3096436262130737, |
| "learning_rate": 4.996533026796152e-05, |
| "loss": 1.2599, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.0268, |
| "grad_norm": 1.5392045974731445, |
| "learning_rate": 4.996448991727483e-05, |
| "loss": 1.2491, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 1.3175737857818604, |
| "learning_rate": 4.996363951089459e-05, |
| "loss": 1.2383, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.0272, |
| "grad_norm": 1.3839282989501953, |
| "learning_rate": 4.9962779049163335e-05, |
| "loss": 1.2739, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.0274, |
| "grad_norm": 0.8403354287147522, |
| "learning_rate": 4.996190853242767e-05, |
| "loss": 1.2378, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.0276, |
| "grad_norm": 1.2463191747665405, |
| "learning_rate": 4.996102796103823e-05, |
| "loss": 1.2248, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.0278, |
| "grad_norm": 1.466070294380188, |
| "learning_rate": 4.996013733534971e-05, |
| "loss": 1.2567, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 0.8661775588989258, |
| "learning_rate": 4.995923665572085e-05, |
| "loss": 1.2372, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.028, |
| "eval_loss": 1.113655686378479, |
| "eval_runtime": 76.3727, |
| "eval_samples_per_second": 202.206, |
| "eval_steps_per_second": 3.169, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.0282, |
| "grad_norm": 0.9262897968292236, |
| "learning_rate": 4.9958325922514466e-05, |
| "loss": 1.2082, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.0284, |
| "grad_norm": 1.406928539276123, |
| "learning_rate": 4.995740513609738e-05, |
| "loss": 1.2576, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.0286, |
| "grad_norm": 0.9858616590499878, |
| "learning_rate": 4.9956474296840485e-05, |
| "loss": 1.2173, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.0288, |
| "grad_norm": 0.6425116062164307, |
| "learning_rate": 4.9955533405118725e-05, |
| "loss": 1.237, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 0.7704317569732666, |
| "learning_rate": 4.9954582461311106e-05, |
| "loss": 1.286, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.0292, |
| "grad_norm": 1.2745368480682373, |
| "learning_rate": 4.995362146580065e-05, |
| "loss": 1.2553, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.0294, |
| "grad_norm": 1.1889222860336304, |
| "learning_rate": 4.995265041897444e-05, |
| "loss": 1.2783, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.0296, |
| "grad_norm": 1.4223252534866333, |
| "learning_rate": 4.9951669321223645e-05, |
| "loss": 1.27, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.0298, |
| "grad_norm": 1.0991147756576538, |
| "learning_rate": 4.995067817294342e-05, |
| "loss": 1.2373, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 1.2834559679031372, |
| "learning_rate": 4.994967697453301e-05, |
| "loss": 1.2725, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.03, |
| "eval_loss": 1.1147979497909546, |
| "eval_runtime": 77.4863, |
| "eval_samples_per_second": 199.3, |
| "eval_steps_per_second": 3.123, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.0302, |
| "grad_norm": 1.3690969944000244, |
| "learning_rate": 4.9948665726395705e-05, |
| "loss": 1.2631, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.0304, |
| "grad_norm": 1.0501981973648071, |
| "learning_rate": 4.994764442893882e-05, |
| "loss": 1.2614, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.0306, |
| "grad_norm": 1.2085719108581543, |
| "learning_rate": 4.994661308257375e-05, |
| "loss": 1.1982, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.0308, |
| "grad_norm": 1.1436259746551514, |
| "learning_rate": 4.994557168771591e-05, |
| "loss": 1.2079, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 0.8355712890625, |
| "learning_rate": 4.994452024478478e-05, |
| "loss": 1.2537, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.0312, |
| "grad_norm": 0.9547547698020935, |
| "learning_rate": 4.9943458754203875e-05, |
| "loss": 1.2399, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.0314, |
| "grad_norm": 1.090165138244629, |
| "learning_rate": 4.994238721640077e-05, |
| "loss": 1.2324, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.0316, |
| "grad_norm": 0.9351906180381775, |
| "learning_rate": 4.9941305631807076e-05, |
| "loss": 1.2431, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.0318, |
| "grad_norm": 1.3740676641464233, |
| "learning_rate": 4.9940214000858456e-05, |
| "loss": 1.2487, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.656019926071167, |
| "learning_rate": 4.993911232399462e-05, |
| "loss": 1.2371, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.032, |
| "eval_loss": 1.1028244495391846, |
| "eval_runtime": 76.4629, |
| "eval_samples_per_second": 201.967, |
| "eval_steps_per_second": 3.165, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.0322, |
| "grad_norm": 1.20018470287323, |
| "learning_rate": 4.9938000601659315e-05, |
| "loss": 1.2547, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.0324, |
| "grad_norm": 1.2216906547546387, |
| "learning_rate": 4.993687883430036e-05, |
| "loss": 1.2327, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.0326, |
| "grad_norm": 1.0969616174697876, |
| "learning_rate": 4.99357470223696e-05, |
| "loss": 1.2513, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.0328, |
| "grad_norm": 1.026194453239441, |
| "learning_rate": 4.99346051663229e-05, |
| "loss": 1.2508, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 1.1246017217636108, |
| "learning_rate": 4.993345326662023e-05, |
| "loss": 1.2538, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.0332, |
| "grad_norm": 1.293093204498291, |
| "learning_rate": 4.993229132372557e-05, |
| "loss": 1.2236, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.0334, |
| "grad_norm": 1.208122730255127, |
| "learning_rate": 4.993111933810695e-05, |
| "loss": 1.2753, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.0336, |
| "grad_norm": 1.073480248451233, |
| "learning_rate": 4.992993731023643e-05, |
| "loss": 1.2665, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.0338, |
| "grad_norm": 1.4211028814315796, |
| "learning_rate": 4.9928745240590146e-05, |
| "loss": 1.2388, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 1.1787285804748535, |
| "learning_rate": 4.992754312964827e-05, |
| "loss": 1.2118, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.034, |
| "eval_loss": 1.104814887046814, |
| "eval_runtime": 76.4454, |
| "eval_samples_per_second": 202.013, |
| "eval_steps_per_second": 3.166, |
| "step": 17000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 500000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.41815485464576e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|