| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.015, |
| "eval_steps": 1000, |
| "global_step": 15000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 1e-06, |
| "grad_norm": 1.1795536279678345, |
| "learning_rate": 0.0, |
| "loss": 1.4139, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0001, |
| "grad_norm": 1.1734141111373901, |
| "learning_rate": 9.900000000000001e-08, |
| "loss": 1.387, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 1.1503151655197144, |
| "learning_rate": 1.9900000000000002e-07, |
| "loss": 1.3882, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0003, |
| "grad_norm": 1.1478229761123657, |
| "learning_rate": 2.99e-07, |
| "loss": 1.386, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 1.1559761762619019, |
| "learning_rate": 3.99e-07, |
| "loss": 1.3823, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 1.1433175802230835, |
| "learning_rate": 4.99e-07, |
| "loss": 1.381, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 1.1483551263809204, |
| "learning_rate": 5.990000000000001e-07, |
| "loss": 1.3807, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0007, |
| "grad_norm": 1.161496877670288, |
| "learning_rate": 6.990000000000001e-07, |
| "loss": 1.3833, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 1.139211654663086, |
| "learning_rate": 7.990000000000001e-07, |
| "loss": 1.3835, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0009, |
| "grad_norm": 1.133931040763855, |
| "learning_rate": 8.99e-07, |
| "loss": 1.3719, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 1.1143814325332642, |
| "learning_rate": 9.99e-07, |
| "loss": 1.3761, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.001, |
| "eval_loss": 1.4045685529708862, |
| "eval_runtime": 27.4497, |
| "eval_samples_per_second": 182.152, |
| "eval_steps_per_second": 2.878, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.0011, |
| "grad_norm": 1.153507947921753, |
| "learning_rate": 1.099e-06, |
| "loss": 1.3669, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 1.1281546354293823, |
| "learning_rate": 1.199e-06, |
| "loss": 1.375, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0013, |
| "grad_norm": 1.1093217134475708, |
| "learning_rate": 1.299e-06, |
| "loss": 1.3726, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 1.1526917219161987, |
| "learning_rate": 1.399e-06, |
| "loss": 1.3696, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 1.1092661619186401, |
| "learning_rate": 1.4990000000000002e-06, |
| "loss": 1.3699, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 1.5104150772094727, |
| "learning_rate": 1.599e-06, |
| "loss": 1.3734, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0017, |
| "grad_norm": 1.1301764249801636, |
| "learning_rate": 1.6990000000000002e-06, |
| "loss": 1.3719, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 1.120370626449585, |
| "learning_rate": 1.7990000000000003e-06, |
| "loss": 1.3695, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0019, |
| "grad_norm": 1.145676612854004, |
| "learning_rate": 1.8990000000000004e-06, |
| "loss": 1.3675, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 1.1365715265274048, |
| "learning_rate": 1.9990000000000003e-06, |
| "loss": 1.3616, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.002, |
| "eval_loss": 1.3989644050598145, |
| "eval_runtime": 24.4886, |
| "eval_samples_per_second": 204.177, |
| "eval_steps_per_second": 3.226, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.0021, |
| "grad_norm": 1.118861198425293, |
| "learning_rate": 2.099e-06, |
| "loss": 1.3635, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 1.1307072639465332, |
| "learning_rate": 2.1990000000000005e-06, |
| "loss": 1.375, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.0023, |
| "grad_norm": 1.088172197341919, |
| "learning_rate": 2.299e-06, |
| "loss": 1.3627, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 1.1681883335113525, |
| "learning_rate": 2.3990000000000002e-06, |
| "loss": 1.3607, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 1.1483210325241089, |
| "learning_rate": 2.499e-06, |
| "loss": 1.3687, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.0026, |
| "grad_norm": 1.1572397947311401, |
| "learning_rate": 2.5990000000000004e-06, |
| "loss": 1.3695, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.0027, |
| "grad_norm": 1.124837875366211, |
| "learning_rate": 2.699e-06, |
| "loss": 1.3532, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 1.0974047183990479, |
| "learning_rate": 2.7990000000000002e-06, |
| "loss": 1.3577, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.0029, |
| "grad_norm": 1.1722006797790527, |
| "learning_rate": 2.899e-06, |
| "loss": 1.3673, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 1.106062650680542, |
| "learning_rate": 2.9990000000000004e-06, |
| "loss": 1.36, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.003, |
| "eval_loss": 1.3754355907440186, |
| "eval_runtime": 24.5927, |
| "eval_samples_per_second": 203.312, |
| "eval_steps_per_second": 3.212, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.0031, |
| "grad_norm": 1.1039618253707886, |
| "learning_rate": 3.0990000000000003e-06, |
| "loss": 1.3567, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 1.1439259052276611, |
| "learning_rate": 3.1990000000000006e-06, |
| "loss": 1.3543, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.0033, |
| "grad_norm": 1.1732087135314941, |
| "learning_rate": 3.2990000000000005e-06, |
| "loss": 1.3464, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.0034, |
| "grad_norm": 1.0517069101333618, |
| "learning_rate": 3.399e-06, |
| "loss": 1.3398, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 1.0987197160720825, |
| "learning_rate": 3.4990000000000003e-06, |
| "loss": 1.356, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 1.1524548530578613, |
| "learning_rate": 3.599e-06, |
| "loss": 1.3481, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.0037, |
| "grad_norm": 1.10309636592865, |
| "learning_rate": 3.6990000000000005e-06, |
| "loss": 1.3515, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.0038, |
| "grad_norm": 1.1285984516143799, |
| "learning_rate": 3.7990000000000004e-06, |
| "loss": 1.3541, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.0039, |
| "grad_norm": 1.1621686220169067, |
| "learning_rate": 3.899e-06, |
| "loss": 1.3532, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 1.078803300857544, |
| "learning_rate": 3.999e-06, |
| "loss": 1.3468, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.004, |
| "eval_loss": 1.3711252212524414, |
| "eval_runtime": 24.5467, |
| "eval_samples_per_second": 203.693, |
| "eval_steps_per_second": 3.218, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.0041, |
| "grad_norm": 1.1375211477279663, |
| "learning_rate": 4.099e-06, |
| "loss": 1.341, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.0042, |
| "grad_norm": 1.0922551155090332, |
| "learning_rate": 4.199e-06, |
| "loss": 1.3427, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.0043, |
| "grad_norm": 1.124060034751892, |
| "learning_rate": 4.299000000000001e-06, |
| "loss": 1.3409, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 1.125467300415039, |
| "learning_rate": 4.3990000000000006e-06, |
| "loss": 1.3467, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 1.1384063959121704, |
| "learning_rate": 4.4990000000000005e-06, |
| "loss": 1.3426, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.0046, |
| "grad_norm": 1.1456679105758667, |
| "learning_rate": 4.599e-06, |
| "loss": 1.3445, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.0047, |
| "grad_norm": 1.1553903818130493, |
| "learning_rate": 4.699e-06, |
| "loss": 1.3372, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 1.1315921545028687, |
| "learning_rate": 4.799e-06, |
| "loss": 1.3398, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.0049, |
| "grad_norm": 1.08122980594635, |
| "learning_rate": 4.899e-06, |
| "loss": 1.3364, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1.09906804561615, |
| "learning_rate": 4.999000000000001e-06, |
| "loss": 1.3366, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.005, |
| "eval_loss": 1.3536914587020874, |
| "eval_runtime": 24.5551, |
| "eval_samples_per_second": 203.624, |
| "eval_steps_per_second": 3.217, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.0051, |
| "grad_norm": 1.1291029453277588, |
| "learning_rate": 5.099000000000001e-06, |
| "loss": 1.3396, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 1.1673402786254883, |
| "learning_rate": 5.1990000000000005e-06, |
| "loss": 1.3358, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.0053, |
| "grad_norm": 1.1300634145736694, |
| "learning_rate": 5.2990000000000004e-06, |
| "loss": 1.3384, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.0054, |
| "grad_norm": 1.1179150342941284, |
| "learning_rate": 5.399000000000001e-06, |
| "loss": 1.3332, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 1.091856837272644, |
| "learning_rate": 5.499000000000001e-06, |
| "loss": 1.3348, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 1.0551645755767822, |
| "learning_rate": 5.599e-06, |
| "loss": 1.336, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.0057, |
| "grad_norm": 1.1457860469818115, |
| "learning_rate": 5.699e-06, |
| "loss": 1.333, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.0058, |
| "grad_norm": 1.1662046909332275, |
| "learning_rate": 5.799e-06, |
| "loss": 1.3299, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.0059, |
| "grad_norm": 1.1879452466964722, |
| "learning_rate": 5.899000000000001e-06, |
| "loss": 1.3354, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.1441973447799683, |
| "learning_rate": 5.9990000000000005e-06, |
| "loss": 1.3329, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.006, |
| "eval_loss": 1.3535875082015991, |
| "eval_runtime": 24.3908, |
| "eval_samples_per_second": 204.995, |
| "eval_steps_per_second": 3.239, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0061, |
| "grad_norm": 1.121394395828247, |
| "learning_rate": 6.099e-06, |
| "loss": 1.3295, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.0062, |
| "grad_norm": 1.1496130228042603, |
| "learning_rate": 6.199e-06, |
| "loss": 1.3303, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.0063, |
| "grad_norm": 1.2465569972991943, |
| "learning_rate": 6.299000000000001e-06, |
| "loss": 1.3268, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 1.1363328695297241, |
| "learning_rate": 6.399000000000001e-06, |
| "loss": 1.3248, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 1.1142207384109497, |
| "learning_rate": 6.499000000000001e-06, |
| "loss": 1.3212, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.0066, |
| "grad_norm": 1.1020450592041016, |
| "learning_rate": 6.599000000000001e-06, |
| "loss": 1.3305, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.0067, |
| "grad_norm": 1.0636595487594604, |
| "learning_rate": 6.699000000000001e-06, |
| "loss": 1.3343, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 1.0846408605575562, |
| "learning_rate": 6.7990000000000005e-06, |
| "loss": 1.3306, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.0069, |
| "grad_norm": 1.2017494440078735, |
| "learning_rate": 6.899e-06, |
| "loss": 1.3191, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.159947156906128, |
| "learning_rate": 6.999e-06, |
| "loss": 1.334, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.007, |
| "eval_loss": 1.3385692834854126, |
| "eval_runtime": 24.4488, |
| "eval_samples_per_second": 204.509, |
| "eval_steps_per_second": 3.231, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.0071, |
| "grad_norm": 1.1962409019470215, |
| "learning_rate": 7.099e-06, |
| "loss": 1.323, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 1.1551247835159302, |
| "learning_rate": 7.199e-06, |
| "loss": 1.3119, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.0073, |
| "grad_norm": 1.1543225049972534, |
| "learning_rate": 7.299000000000001e-06, |
| "loss": 1.3261, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.0074, |
| "grad_norm": 1.133355975151062, |
| "learning_rate": 7.399000000000001e-06, |
| "loss": 1.3241, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 1.1490956544876099, |
| "learning_rate": 7.4990000000000005e-06, |
| "loss": 1.3293, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 1.0732618570327759, |
| "learning_rate": 7.5990000000000004e-06, |
| "loss": 1.3216, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.0077, |
| "grad_norm": 1.170203685760498, |
| "learning_rate": 7.699e-06, |
| "loss": 1.3193, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.0078, |
| "grad_norm": 1.0613148212432861, |
| "learning_rate": 7.799000000000001e-06, |
| "loss": 1.329, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.0079, |
| "grad_norm": 1.2019593715667725, |
| "learning_rate": 7.899000000000002e-06, |
| "loss": 1.315, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.1080353260040283, |
| "learning_rate": 7.999e-06, |
| "loss": 1.3181, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.008, |
| "eval_loss": 1.3239587545394897, |
| "eval_runtime": 24.4556, |
| "eval_samples_per_second": 204.452, |
| "eval_steps_per_second": 3.23, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.0081, |
| "grad_norm": 1.1273937225341797, |
| "learning_rate": 8.099e-06, |
| "loss": 1.3252, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.0082, |
| "grad_norm": 1.0942583084106445, |
| "learning_rate": 8.199e-06, |
| "loss": 1.3164, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.0083, |
| "grad_norm": 1.1845577955245972, |
| "learning_rate": 8.299e-06, |
| "loss": 1.32, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 1.2376071214675903, |
| "learning_rate": 8.399e-06, |
| "loss": 1.314, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 1.5554766654968262, |
| "learning_rate": 8.499000000000001e-06, |
| "loss": 1.4128, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.0086, |
| "grad_norm": 1.736693024635315, |
| "learning_rate": 8.599e-06, |
| "loss": 1.5028, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.0087, |
| "grad_norm": 1.8339451551437378, |
| "learning_rate": 8.699000000000001e-06, |
| "loss": 1.5346, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 1.827017068862915, |
| "learning_rate": 8.799000000000002e-06, |
| "loss": 1.5309, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.0089, |
| "grad_norm": 1.7209491729736328, |
| "learning_rate": 8.899e-06, |
| "loss": 1.5202, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1.7649836540222168, |
| "learning_rate": 8.999000000000001e-06, |
| "loss": 1.5311, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.009, |
| "eval_loss": 1.3453279733657837, |
| "eval_runtime": 24.4639, |
| "eval_samples_per_second": 204.383, |
| "eval_steps_per_second": 3.229, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.0091, |
| "grad_norm": 1.758984923362732, |
| "learning_rate": 9.099e-06, |
| "loss": 1.5277, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 1.5517253875732422, |
| "learning_rate": 9.199000000000001e-06, |
| "loss": 1.5331, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.0093, |
| "grad_norm": 1.7491697072982788, |
| "learning_rate": 9.299e-06, |
| "loss": 1.5376, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.0094, |
| "grad_norm": 1.7253761291503906, |
| "learning_rate": 9.399000000000001e-06, |
| "loss": 1.5319, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 1.7779654264450073, |
| "learning_rate": 9.499e-06, |
| "loss": 1.5455, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 1.8502960205078125, |
| "learning_rate": 9.599e-06, |
| "loss": 1.5256, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.0097, |
| "grad_norm": 1.595805287361145, |
| "learning_rate": 9.699e-06, |
| "loss": 1.5338, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.0098, |
| "grad_norm": 1.7826145887374878, |
| "learning_rate": 9.799e-06, |
| "loss": 1.5297, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.0099, |
| "grad_norm": 1.8574384450912476, |
| "learning_rate": 9.899000000000001e-06, |
| "loss": 1.537, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.6225100755691528, |
| "learning_rate": 9.999e-06, |
| "loss": 1.5373, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.01, |
| "eval_loss": 1.3474788665771484, |
| "eval_runtime": 24.6009, |
| "eval_samples_per_second": 203.244, |
| "eval_steps_per_second": 3.211, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.0101, |
| "grad_norm": 1.7013579607009888, |
| "learning_rate": 9.999999753259893e-06, |
| "loss": 1.5213, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.0102, |
| "grad_norm": 1.8451807498931885, |
| "learning_rate": 9.999999003045122e-06, |
| "loss": 1.5252, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.0103, |
| "grad_norm": 1.6487650871276855, |
| "learning_rate": 9.999997749330588e-06, |
| "loss": 1.5313, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 1.7240970134735107, |
| "learning_rate": 9.999995992116415e-06, |
| "loss": 1.5375, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 1.5860111713409424, |
| "learning_rate": 9.999993731402786e-06, |
| "loss": 1.535, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.0106, |
| "grad_norm": 1.6990783214569092, |
| "learning_rate": 9.999990967189924e-06, |
| "loss": 1.5415, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.0107, |
| "grad_norm": 1.7421098947525024, |
| "learning_rate": 9.999987699478109e-06, |
| "loss": 1.5266, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 1.6578110456466675, |
| "learning_rate": 9.999983928267668e-06, |
| "loss": 1.5256, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.0109, |
| "grad_norm": 1.8193341493606567, |
| "learning_rate": 9.999979653558982e-06, |
| "loss": 1.54, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 1.7376822233200073, |
| "learning_rate": 9.999974875352482e-06, |
| "loss": 1.5345, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.011, |
| "eval_loss": 1.3439626693725586, |
| "eval_runtime": 24.6158, |
| "eval_samples_per_second": 203.122, |
| "eval_steps_per_second": 3.209, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.0111, |
| "grad_norm": 1.7770408391952515, |
| "learning_rate": 9.999969593648651e-06, |
| "loss": 1.5257, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 1.703754186630249, |
| "learning_rate": 9.999963808448016e-06, |
| "loss": 1.523, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.0113, |
| "grad_norm": 1.7194414138793945, |
| "learning_rate": 9.999957519751165e-06, |
| "loss": 1.5404, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.0114, |
| "grad_norm": 1.694810390472412, |
| "learning_rate": 9.999950727558727e-06, |
| "loss": 1.534, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 1.644400715827942, |
| "learning_rate": 9.999943431871388e-06, |
| "loss": 1.531, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 1.792406678199768, |
| "learning_rate": 9.99993563268988e-06, |
| "loss": 1.5298, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.0117, |
| "grad_norm": 1.9580830335617065, |
| "learning_rate": 9.999927330014993e-06, |
| "loss": 1.5268, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.0118, |
| "grad_norm": 1.6442023515701294, |
| "learning_rate": 9.99991852384756e-06, |
| "loss": 1.5257, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.0119, |
| "grad_norm": 1.680830478668213, |
| "learning_rate": 9.99990921418847e-06, |
| "loss": 1.5191, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 1.6746671199798584, |
| "learning_rate": 9.999899401038656e-06, |
| "loss": 1.5372, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.012, |
| "eval_loss": 1.3511897325515747, |
| "eval_runtime": 24.555, |
| "eval_samples_per_second": 203.625, |
| "eval_steps_per_second": 3.217, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.0121, |
| "grad_norm": 1.7775862216949463, |
| "learning_rate": 9.99988908439911e-06, |
| "loss": 1.5182, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.0122, |
| "grad_norm": 1.5296705961227417, |
| "learning_rate": 9.999878264270871e-06, |
| "loss": 1.5303, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.0123, |
| "grad_norm": 1.7957079410552979, |
| "learning_rate": 9.999866940655027e-06, |
| "loss": 1.5328, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.0124, |
| "grad_norm": 1.8484801054000854, |
| "learning_rate": 9.99985511355272e-06, |
| "loss": 1.5162, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 1.7253010272979736, |
| "learning_rate": 9.999842782965139e-06, |
| "loss": 1.5178, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.0126, |
| "grad_norm": 1.7495081424713135, |
| "learning_rate": 9.999829948893528e-06, |
| "loss": 1.5233, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.0127, |
| "grad_norm": 1.6750719547271729, |
| "learning_rate": 9.999816611339175e-06, |
| "loss": 1.5203, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 1.7870038747787476, |
| "learning_rate": 9.999802770303427e-06, |
| "loss": 1.5106, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.0129, |
| "grad_norm": 1.6229153871536255, |
| "learning_rate": 9.999788425787678e-06, |
| "loss": 1.5399, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 1.7483490705490112, |
| "learning_rate": 9.99977357779337e-06, |
| "loss": 1.519, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.013, |
| "eval_loss": 1.3341424465179443, |
| "eval_runtime": 24.5433, |
| "eval_samples_per_second": 203.722, |
| "eval_steps_per_second": 3.219, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.0131, |
| "grad_norm": 1.7631748914718628, |
| "learning_rate": 9.999758226322e-06, |
| "loss": 1.5232, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.0132, |
| "grad_norm": 1.6134735345840454, |
| "learning_rate": 9.999742371375114e-06, |
| "loss": 1.5352, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.0133, |
| "grad_norm": 1.8494335412979126, |
| "learning_rate": 9.999726012954308e-06, |
| "loss": 1.5254, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.0134, |
| "grad_norm": 1.9245802164077759, |
| "learning_rate": 9.999709151061228e-06, |
| "loss": 1.5358, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 1.755018711090088, |
| "learning_rate": 9.999691785697574e-06, |
| "loss": 1.5204, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.0136, |
| "grad_norm": 1.8922946453094482, |
| "learning_rate": 9.999673916865094e-06, |
| "loss": 1.5267, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.0137, |
| "grad_norm": 1.9781936407089233, |
| "learning_rate": 9.999655544565587e-06, |
| "loss": 1.5213, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.0138, |
| "grad_norm": 1.8312381505966187, |
| "learning_rate": 9.999636668800905e-06, |
| "loss": 1.517, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.0139, |
| "grad_norm": 1.6503413915634155, |
| "learning_rate": 9.999617289572946e-06, |
| "loss": 1.5169, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 1.8612747192382812, |
| "learning_rate": 9.999597406883664e-06, |
| "loss": 1.5277, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.014, |
| "eval_loss": 1.3367455005645752, |
| "eval_runtime": 24.5718, |
| "eval_samples_per_second": 203.485, |
| "eval_steps_per_second": 3.215, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.0141, |
| "grad_norm": 1.8900790214538574, |
| "learning_rate": 9.999577020735059e-06, |
| "loss": 1.5276, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.0142, |
| "grad_norm": 1.720528244972229, |
| "learning_rate": 9.999556131129184e-06, |
| "loss": 1.5209, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.0143, |
| "grad_norm": 1.713659405708313, |
| "learning_rate": 9.999534738068145e-06, |
| "loss": 1.5194, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.0144, |
| "grad_norm": 1.662377119064331, |
| "learning_rate": 9.999512841554093e-06, |
| "loss": 1.5179, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 1.6507668495178223, |
| "learning_rate": 9.999490441589235e-06, |
| "loss": 1.5181, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.0146, |
| "grad_norm": 1.7075133323669434, |
| "learning_rate": 9.999467538175827e-06, |
| "loss": 1.5203, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.0147, |
| "grad_norm": 1.686068058013916, |
| "learning_rate": 9.999444131316173e-06, |
| "loss": 1.5156, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.0148, |
| "grad_norm": 1.6891603469848633, |
| "learning_rate": 9.999420221012635e-06, |
| "loss": 1.5195, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.0149, |
| "grad_norm": 1.784029245376587, |
| "learning_rate": 9.999395807267616e-06, |
| "loss": 1.509, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 1.6361267566680908, |
| "learning_rate": 9.999370890083575e-06, |
| "loss": 1.5248, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.015, |
| "eval_loss": 1.3349226713180542, |
| "eval_runtime": 24.5747, |
| "eval_samples_per_second": 203.461, |
| "eval_steps_per_second": 3.215, |
| "step": 15000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 1000000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.03079253229568e+18, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|