{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.605851979345955, "eval_steps": 2500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 1.0365092754364014, "learning_rate": 1.9600000000000002e-05, "loss": 0.4163, "step": 100 }, { "epoch": 0.34, "grad_norm": 1.212004542350769, "learning_rate": 1.9200000000000003e-05, "loss": 0.3793, "step": 200 }, { "epoch": 0.52, "grad_norm": 1.066266417503357, "learning_rate": 1.88e-05, "loss": 0.3682, "step": 300 }, { "epoch": 0.69, "grad_norm": 1.322099208831787, "learning_rate": 1.8400000000000003e-05, "loss": 0.3536, "step": 400 }, { "epoch": 0.86, "grad_norm": 0.998599648475647, "learning_rate": 1.8e-05, "loss": 0.3282, "step": 500 }, { "epoch": 1.03, "grad_norm": 1.5098826885223389, "learning_rate": 1.76e-05, "loss": 0.3092, "step": 600 }, { "epoch": 1.2, "grad_norm": 1.05723237991333, "learning_rate": 1.72e-05, "loss": 0.2248, "step": 700 }, { "epoch": 1.38, "grad_norm": 1.0882526636123657, "learning_rate": 1.6800000000000002e-05, "loss": 0.2239, "step": 800 }, { "epoch": 1.55, "grad_norm": 1.1547547578811646, "learning_rate": 1.64e-05, "loss": 0.2342, "step": 900 }, { "epoch": 1.72, "grad_norm": 1.1294739246368408, "learning_rate": 1.6000000000000003e-05, "loss": 0.2158, "step": 1000 }, { "epoch": 1.89, "grad_norm": 0.9624162912368774, "learning_rate": 1.5600000000000003e-05, "loss": 0.2096, "step": 1100 }, { "epoch": 2.07, "grad_norm": 1.1864293813705444, "learning_rate": 1.5200000000000002e-05, "loss": 0.1787, "step": 1200 }, { "epoch": 2.24, "grad_norm": 1.1997874975204468, "learning_rate": 1.48e-05, "loss": 0.1246, "step": 1300 }, { "epoch": 2.41, "grad_norm": 1.2120954990386963, "learning_rate": 1.4400000000000001e-05, "loss": 0.1197, "step": 1400 }, { "epoch": 2.58, "grad_norm": 0.6992385983467102, "learning_rate": 1.4e-05, "loss": 0.1185, "step": 1500 }, { "epoch": 2.75, "grad_norm": 1.0601509809494019, "learning_rate": 1.3600000000000002e-05, "loss": 0.1241, "step": 1600 }, { "epoch": 2.93, "grad_norm": 1.1058382987976074, "learning_rate": 1.3200000000000002e-05, "loss": 0.1282, "step": 1700 }, { "epoch": 3.1, "grad_norm": 1.1598687171936035, "learning_rate": 1.2800000000000001e-05, "loss": 0.0847, "step": 1800 }, { "epoch": 3.27, "grad_norm": 1.2096168994903564, "learning_rate": 1.2400000000000002e-05, "loss": 0.0616, "step": 1900 }, { "epoch": 3.44, "grad_norm": 1.5343897342681885, "learning_rate": 1.2e-05, "loss": 0.0645, "step": 2000 }, { "epoch": 3.61, "grad_norm": 1.165819764137268, "learning_rate": 1.16e-05, "loss": 0.0652, "step": 2100 }, { "epoch": 3.79, "grad_norm": 1.3763171434402466, "learning_rate": 1.1200000000000001e-05, "loss": 0.0619, "step": 2200 }, { "epoch": 3.96, "grad_norm": 0.9929534792900085, "learning_rate": 1.0800000000000002e-05, "loss": 0.0612, "step": 2300 }, { "epoch": 4.13, "grad_norm": 1.1144566535949707, "learning_rate": 1.04e-05, "loss": 0.038, "step": 2400 }, { "epoch": 4.3, "grad_norm": 1.150139570236206, "learning_rate": 1e-05, "loss": 0.0311, "step": 2500 }, { "epoch": 4.3, "eval_loss": 0.35374003648757935, "eval_runtime": 84.8171, "eval_samples_per_second": 11.79, "eval_steps_per_second": 2.948, "step": 2500 }, { "epoch": 4.48, "grad_norm": 1.4293252229690552, "learning_rate": 9.600000000000001e-06, "loss": 0.0308, "step": 2600 }, { "epoch": 4.65, "grad_norm": 1.1352962255477905, "learning_rate": 9.200000000000002e-06, "loss": 0.0308, "step": 2700 }, { "epoch": 4.82, "grad_norm": 1.0544779300689697, "learning_rate": 8.8e-06, "loss": 0.033, "step": 2800 }, { "epoch": 4.99, "grad_norm": 1.110599160194397, "learning_rate": 8.400000000000001e-06, "loss": 0.0318, "step": 2900 }, { "epoch": 5.16, "grad_norm": 0.7125316262245178, "learning_rate": 8.000000000000001e-06, "loss": 0.0147, "step": 3000 }, { "epoch": 5.34, "grad_norm": 0.9172051548957825, "learning_rate": 7.600000000000001e-06, "loss": 0.0156, "step": 3100 }, { "epoch": 5.51, "grad_norm": 0.9805625081062317, "learning_rate": 7.2000000000000005e-06, "loss": 0.0145, "step": 3200 }, { "epoch": 5.68, "grad_norm": 0.5053761601448059, "learning_rate": 6.800000000000001e-06, "loss": 0.0149, "step": 3300 }, { "epoch": 5.85, "grad_norm": 1.1218398809432983, "learning_rate": 6.4000000000000006e-06, "loss": 0.0168, "step": 3400 }, { "epoch": 6.02, "grad_norm": 0.3119220733642578, "learning_rate": 6e-06, "loss": 0.0154, "step": 3500 }, { "epoch": 6.2, "grad_norm": 0.23416651785373688, "learning_rate": 5.600000000000001e-06, "loss": 0.0065, "step": 3600 }, { "epoch": 6.37, "grad_norm": 0.6167200803756714, "learning_rate": 5.2e-06, "loss": 0.0079, "step": 3700 }, { "epoch": 6.54, "grad_norm": 1.1704833507537842, "learning_rate": 4.800000000000001e-06, "loss": 0.0067, "step": 3800 }, { "epoch": 6.71, "grad_norm": 0.8806678056716919, "learning_rate": 4.4e-06, "loss": 0.0093, "step": 3900 }, { "epoch": 6.88, "grad_norm": 0.30924132466316223, "learning_rate": 4.000000000000001e-06, "loss": 0.007, "step": 4000 }, { "epoch": 7.06, "grad_norm": 0.46306928992271423, "learning_rate": 3.6000000000000003e-06, "loss": 0.0052, "step": 4100 }, { "epoch": 7.23, "grad_norm": 0.46887511014938354, "learning_rate": 3.2000000000000003e-06, "loss": 0.0042, "step": 4200 }, { "epoch": 7.4, "grad_norm": 0.902063250541687, "learning_rate": 2.8000000000000003e-06, "loss": 0.0031, "step": 4300 }, { "epoch": 7.57, "grad_norm": 0.1910380870103836, "learning_rate": 2.4000000000000003e-06, "loss": 0.0029, "step": 4400 }, { "epoch": 7.75, "grad_norm": 0.6202380657196045, "learning_rate": 2.0000000000000003e-06, "loss": 0.0032, "step": 4500 }, { "epoch": 7.92, "grad_norm": 0.5730396509170532, "learning_rate": 1.6000000000000001e-06, "loss": 0.0034, "step": 4600 }, { "epoch": 8.09, "grad_norm": 0.10635427385568619, "learning_rate": 1.2000000000000002e-06, "loss": 0.0034, "step": 4700 }, { "epoch": 8.26, "grad_norm": 0.1567939668893814, "learning_rate": 8.000000000000001e-07, "loss": 0.0027, "step": 4800 }, { "epoch": 8.43, "grad_norm": 0.11498889327049255, "learning_rate": 4.0000000000000003e-07, "loss": 0.0015, "step": 4900 }, { "epoch": 8.61, "grad_norm": 0.09903218597173691, "learning_rate": 0.0, "loss": 0.0017, "step": 5000 }, { "epoch": 8.61, "eval_loss": 0.4493824243545532, "eval_runtime": 84.77, "eval_samples_per_second": 11.797, "eval_steps_per_second": 2.949, "step": 5000 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 2500, "total_flos": 1.258569996863275e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }