{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.605851979345955,
  "eval_steps": 2500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "grad_norm": 1.0365092754364014,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.4163,
      "step": 100
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.212004542350769,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.3793,
      "step": 200
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.066266417503357,
      "learning_rate": 1.88e-05,
      "loss": 0.3682,
      "step": 300
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.322099208831787,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.3536,
      "step": 400
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.998599648475647,
      "learning_rate": 1.8e-05,
      "loss": 0.3282,
      "step": 500
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.5098826885223389,
      "learning_rate": 1.76e-05,
      "loss": 0.3092,
      "step": 600
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.05723237991333,
      "learning_rate": 1.72e-05,
      "loss": 0.2248,
      "step": 700
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.0882526636123657,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.2239,
      "step": 800
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.1547547578811646,
      "learning_rate": 1.64e-05,
      "loss": 0.2342,
      "step": 900
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.1294739246368408,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.2158,
      "step": 1000
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.9624162912368774,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.2096,
      "step": 1100
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.1864293813705444,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.1787,
      "step": 1200
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.1997874975204468,
      "learning_rate": 1.48e-05,
      "loss": 0.1246,
      "step": 1300
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.2120954990386963,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.1197,
      "step": 1400
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.6992385983467102,
      "learning_rate": 1.4e-05,
      "loss": 0.1185,
      "step": 1500
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.0601509809494019,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.1241,
      "step": 1600
    },
    {
      "epoch": 2.93,
      "grad_norm": 1.1058382987976074,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 0.1282,
      "step": 1700
    },
    {
      "epoch": 3.1,
      "grad_norm": 1.1598687171936035,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.0847,
      "step": 1800
    },
    {
      "epoch": 3.27,
      "grad_norm": 1.2096168994903564,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.0616,
      "step": 1900
    },
    {
      "epoch": 3.44,
      "grad_norm": 1.5343897342681885,
      "learning_rate": 1.2e-05,
      "loss": 0.0645,
      "step": 2000
    },
    {
      "epoch": 3.61,
      "grad_norm": 1.165819764137268,
      "learning_rate": 1.16e-05,
      "loss": 0.0652,
      "step": 2100
    },
    {
      "epoch": 3.79,
      "grad_norm": 1.3763171434402466,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.0619,
      "step": 2200
    },
    {
      "epoch": 3.96,
      "grad_norm": 0.9929534792900085,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 0.0612,
      "step": 2300
    },
    {
      "epoch": 4.13,
      "grad_norm": 1.1144566535949707,
      "learning_rate": 1.04e-05,
      "loss": 0.038,
      "step": 2400
    },
    {
      "epoch": 4.3,
      "grad_norm": 1.150139570236206,
      "learning_rate": 1e-05,
      "loss": 0.0311,
      "step": 2500
    },
    {
      "epoch": 4.3,
      "eval_loss": 0.35374003648757935,
      "eval_runtime": 84.8171,
      "eval_samples_per_second": 11.79,
      "eval_steps_per_second": 2.948,
      "step": 2500
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.4293252229690552,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.0308,
      "step": 2600
    },
    {
      "epoch": 4.65,
      "grad_norm": 1.1352962255477905,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.0308,
      "step": 2700
    },
    {
      "epoch": 4.82,
      "grad_norm": 1.0544779300689697,
      "learning_rate": 8.8e-06,
      "loss": 0.033,
      "step": 2800
    },
    {
      "epoch": 4.99,
      "grad_norm": 1.110599160194397,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.0318,
      "step": 2900
    },
    {
      "epoch": 5.16,
      "grad_norm": 0.7125316262245178,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.0147,
      "step": 3000
    },
    {
      "epoch": 5.34,
      "grad_norm": 0.9172051548957825,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.0156,
      "step": 3100
    },
    {
      "epoch": 5.51,
      "grad_norm": 0.9805625081062317,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.0145,
      "step": 3200
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.5053761601448059,
      "learning_rate": 6.800000000000001e-06,
      "loss": 0.0149,
      "step": 3300
    },
    {
      "epoch": 5.85,
      "grad_norm": 1.1218398809432983,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.0168,
      "step": 3400
    },
    {
      "epoch": 6.02,
      "grad_norm": 0.3119220733642578,
      "learning_rate": 6e-06,
      "loss": 0.0154,
      "step": 3500
    },
    {
      "epoch": 6.2,
      "grad_norm": 0.23416651785373688,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.0065,
      "step": 3600
    },
    {
      "epoch": 6.37,
      "grad_norm": 0.6167200803756714,
      "learning_rate": 5.2e-06,
      "loss": 0.0079,
      "step": 3700
    },
    {
      "epoch": 6.54,
      "grad_norm": 1.1704833507537842,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.0067,
      "step": 3800
    },
    {
      "epoch": 6.71,
      "grad_norm": 0.8806678056716919,
      "learning_rate": 4.4e-06,
      "loss": 0.0093,
      "step": 3900
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.30924132466316223,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.007,
      "step": 4000
    },
    {
      "epoch": 7.06,
      "grad_norm": 0.46306928992271423,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0052,
      "step": 4100
    },
    {
      "epoch": 7.23,
      "grad_norm": 0.46887511014938354,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.0042,
      "step": 4200
    },
    {
      "epoch": 7.4,
      "grad_norm": 0.902063250541687,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0031,
      "step": 4300
    },
    {
      "epoch": 7.57,
      "grad_norm": 0.1910380870103836,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.0029,
      "step": 4400
    },
    {
      "epoch": 7.75,
      "grad_norm": 0.6202380657196045,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0032,
      "step": 4500
    },
    {
      "epoch": 7.92,
      "grad_norm": 0.5730396509170532,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.0034,
      "step": 4600
    },
    {
      "epoch": 8.09,
      "grad_norm": 0.10635427385568619,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.0034,
      "step": 4700
    },
    {
      "epoch": 8.26,
      "grad_norm": 0.1567939668893814,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.0027,
      "step": 4800
    },
    {
      "epoch": 8.43,
      "grad_norm": 0.11498889327049255,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 0.0015,
      "step": 4900
    },
    {
      "epoch": 8.61,
      "grad_norm": 0.09903218597173691,
      "learning_rate": 0.0,
      "loss": 0.0017,
      "step": 5000
    },
    {
      "epoch": 8.61,
      "eval_loss": 0.4493824243545532,
      "eval_runtime": 84.77,
      "eval_samples_per_second": 11.797,
      "eval_steps_per_second": 2.949,
      "step": 5000
    }
  ],
  "logging_steps": 100,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 2500,
  "total_flos": 1.258569996863275e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}