{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2254901960784315, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012254901960784314, "grad_norm": 1.5234375, "learning_rate": 0.00019800000000000002, "loss": 0.9967, "step": 50 }, { "epoch": 0.024509803921568627, "grad_norm": 0.0010223388671875, "learning_rate": 0.000196, "loss": 0.8313, "step": 100 }, { "epoch": 0.03676470588235294, "grad_norm": 0.00982666015625, "learning_rate": 0.000194, "loss": 0.768, "step": 150 }, { "epoch": 0.049019607843137254, "grad_norm": 11.25, "learning_rate": 0.000192, "loss": 0.6193, "step": 200 }, { "epoch": 0.061274509803921566, "grad_norm": 0.00038909912109375, "learning_rate": 0.00019, "loss": 0.6394, "step": 250 }, { "epoch": 0.07352941176470588, "grad_norm": 0.001739501953125, "learning_rate": 0.000188, "loss": 0.6524, "step": 300 }, { "epoch": 0.0857843137254902, "grad_norm": 0.00115203857421875, "learning_rate": 0.00018600000000000002, "loss": 0.7889, "step": 350 }, { "epoch": 0.09803921568627451, "grad_norm": 2.96875, "learning_rate": 0.00018400000000000003, "loss": 0.5214, "step": 400 }, { "epoch": 0.11029411764705882, "grad_norm": 0.001251220703125, "learning_rate": 0.000182, "loss": 0.7389, "step": 450 }, { "epoch": 0.12254901960784313, "grad_norm": 5.3125, "learning_rate": 0.00018, "loss": 0.8295, "step": 500 }, { "epoch": 0.13480392156862744, "grad_norm": 0.0145263671875, "learning_rate": 0.00017800000000000002, "loss": 0.907, "step": 550 }, { "epoch": 0.14705882352941177, "grad_norm": 5.0625, "learning_rate": 0.00017600000000000002, "loss": 0.7512, "step": 600 }, { "epoch": 0.15931372549019607, "grad_norm": 0.006805419921875, "learning_rate": 0.000174, "loss": 0.5408, "step": 650 }, { "epoch": 0.1715686274509804, "grad_norm": 0.00019550323486328125, "learning_rate": 0.000172, "loss": 1.0083, "step": 700 }, { "epoch": 0.18382352941176472, "grad_norm": 8.046627044677734e-06, "learning_rate": 0.00017, "loss": 0.7069, "step": 750 }, { "epoch": 0.19607843137254902, "grad_norm": 3.921875, "learning_rate": 0.000168, "loss": 0.705, "step": 800 }, { "epoch": 0.20833333333333334, "grad_norm": 0.07373046875, "learning_rate": 0.000166, "loss": 0.8727, "step": 850 }, { "epoch": 0.22058823529411764, "grad_norm": 0.1220703125, "learning_rate": 0.000164, "loss": 0.6222, "step": 900 }, { "epoch": 0.23284313725490197, "grad_norm": 0.002166748046875, "learning_rate": 0.000162, "loss": 0.5294, "step": 950 }, { "epoch": 0.24509803921568626, "grad_norm": 0.00244140625, "learning_rate": 0.00016, "loss": 0.5111, "step": 1000 }, { "epoch": 0.25735294117647056, "grad_norm": 6.854534149169922e-06, "learning_rate": 0.00015800000000000002, "loss": 0.7514, "step": 1050 }, { "epoch": 0.2696078431372549, "grad_norm": 6.78125, "learning_rate": 0.00015600000000000002, "loss": 0.6244, "step": 1100 }, { "epoch": 0.2818627450980392, "grad_norm": 0.002227783203125, "learning_rate": 0.000154, "loss": 0.9231, "step": 1150 }, { "epoch": 0.29411764705882354, "grad_norm": 0.49609375, "learning_rate": 0.000152, "loss": 0.5098, "step": 1200 }, { "epoch": 0.30637254901960786, "grad_norm": 0.00982666015625, "learning_rate": 0.00015000000000000001, "loss": 0.7692, "step": 1250 }, { "epoch": 0.31862745098039214, "grad_norm": 2.15625, "learning_rate": 0.000148, "loss": 0.704, "step": 1300 }, { "epoch": 0.33088235294117646, "grad_norm": 10.75, "learning_rate": 0.000146, "loss": 0.4461, "step": 1350 }, { "epoch": 0.3431372549019608, "grad_norm": 0.0040283203125, "learning_rate": 0.000144, "loss": 0.8833, "step": 1400 }, { "epoch": 0.3553921568627451, "grad_norm": 1.125, "learning_rate": 0.000142, "loss": 0.7667, "step": 1450 }, { "epoch": 0.36764705882352944, "grad_norm": 3.53125, "learning_rate": 0.00014, "loss": 0.4744, "step": 1500 }, { "epoch": 0.3799019607843137, "grad_norm": 0.0004367828369140625, "learning_rate": 0.000138, "loss": 0.6662, "step": 1550 }, { "epoch": 0.39215686274509803, "grad_norm": 0.00011301040649414062, "learning_rate": 0.00013600000000000003, "loss": 0.4074, "step": 1600 }, { "epoch": 0.40441176470588236, "grad_norm": 4.57763671875e-05, "learning_rate": 0.000134, "loss": 0.5632, "step": 1650 }, { "epoch": 0.4166666666666667, "grad_norm": 0.0001239776611328125, "learning_rate": 0.000132, "loss": 0.7105, "step": 1700 }, { "epoch": 0.42892156862745096, "grad_norm": 0.000339508056640625, "learning_rate": 0.00013000000000000002, "loss": 0.7244, "step": 1750 }, { "epoch": 0.4411764705882353, "grad_norm": 4.125, "learning_rate": 0.00012800000000000002, "loss": 0.7665, "step": 1800 }, { "epoch": 0.4534313725490196, "grad_norm": 0.00133514404296875, "learning_rate": 0.000126, "loss": 0.4724, "step": 1850 }, { "epoch": 0.46568627450980393, "grad_norm": 3.015625, "learning_rate": 0.000124, "loss": 0.6663, "step": 1900 }, { "epoch": 0.47794117647058826, "grad_norm": 2.515625, "learning_rate": 0.000122, "loss": 0.5986, "step": 1950 }, { "epoch": 0.49019607843137253, "grad_norm": 0.0004730224609375, "learning_rate": 0.00012, "loss": 0.4979, "step": 2000 }, { "epoch": 0.5024509803921569, "grad_norm": 15.375, "learning_rate": 0.000118, "loss": 0.6502, "step": 2050 }, { "epoch": 0.5147058823529411, "grad_norm": 0.2578125, "learning_rate": 0.000116, "loss": 0.7484, "step": 2100 }, { "epoch": 0.5269607843137255, "grad_norm": 0.000682830810546875, "learning_rate": 0.00011399999999999999, "loss": 0.5358, "step": 2150 }, { "epoch": 0.5392156862745098, "grad_norm": 0.006134033203125, "learning_rate": 0.00011200000000000001, "loss": 0.6424, "step": 2200 }, { "epoch": 0.5514705882352942, "grad_norm": 5.14984130859375e-05, "learning_rate": 0.00011000000000000002, "loss": 0.4999, "step": 2250 }, { "epoch": 0.5637254901960784, "grad_norm": 4.96875, "learning_rate": 0.00010800000000000001, "loss": 0.5711, "step": 2300 }, { "epoch": 0.5759803921568627, "grad_norm": 4.029273986816406e-05, "learning_rate": 0.00010600000000000002, "loss": 0.6556, "step": 2350 }, { "epoch": 0.5882352941176471, "grad_norm": 0.00010442733764648438, "learning_rate": 0.00010400000000000001, "loss": 0.6942, "step": 2400 }, { "epoch": 0.6004901960784313, "grad_norm": 0.00110626220703125, "learning_rate": 0.00010200000000000001, "loss": 0.837, "step": 2450 }, { "epoch": 0.6127450980392157, "grad_norm": 6.961822509765625e-05, "learning_rate": 0.0001, "loss": 0.7442, "step": 2500 }, { "epoch": 0.625, "grad_norm": 3.640625, "learning_rate": 9.8e-05, "loss": 0.683, "step": 2550 }, { "epoch": 0.6372549019607843, "grad_norm": 0.01531982421875, "learning_rate": 9.6e-05, "loss": 0.7334, "step": 2600 }, { "epoch": 0.6495098039215687, "grad_norm": 8.916854858398438e-05, "learning_rate": 9.4e-05, "loss": 0.5578, "step": 2650 }, { "epoch": 0.6617647058823529, "grad_norm": 0.0001468658447265625, "learning_rate": 9.200000000000001e-05, "loss": 0.5033, "step": 2700 }, { "epoch": 0.6740196078431373, "grad_norm": 0.00014781951904296875, "learning_rate": 9e-05, "loss": 0.3723, "step": 2750 }, { "epoch": 0.6862745098039216, "grad_norm": 5.6875, "learning_rate": 8.800000000000001e-05, "loss": 1.0245, "step": 2800 }, { "epoch": 0.6985294117647058, "grad_norm": 0.0012969970703125, "learning_rate": 8.6e-05, "loss": 0.4378, "step": 2850 }, { "epoch": 0.7107843137254902, "grad_norm": 0.0028839111328125, "learning_rate": 8.4e-05, "loss": 0.2688, "step": 2900 }, { "epoch": 0.7230392156862745, "grad_norm": 0.03857421875, "learning_rate": 8.2e-05, "loss": 0.5873, "step": 2950 }, { "epoch": 0.7352941176470589, "grad_norm": 13.0, "learning_rate": 8e-05, "loss": 0.7184, "step": 3000 }, { "epoch": 0.7475490196078431, "grad_norm": 0.000759124755859375, "learning_rate": 7.800000000000001e-05, "loss": 0.6188, "step": 3050 }, { "epoch": 0.7598039215686274, "grad_norm": 0.00311279296875, "learning_rate": 7.6e-05, "loss": 0.6364, "step": 3100 }, { "epoch": 0.7720588235294118, "grad_norm": 0.00958251953125, "learning_rate": 7.4e-05, "loss": 0.3319, "step": 3150 }, { "epoch": 0.7843137254901961, "grad_norm": 6.0625, "learning_rate": 7.2e-05, "loss": 0.579, "step": 3200 }, { "epoch": 0.7965686274509803, "grad_norm": 0.00823974609375, "learning_rate": 7e-05, "loss": 0.3845, "step": 3250 }, { "epoch": 0.8088235294117647, "grad_norm": 5.78125, "learning_rate": 6.800000000000001e-05, "loss": 0.5561, "step": 3300 }, { "epoch": 0.821078431372549, "grad_norm": 4.0625, "learning_rate": 6.6e-05, "loss": 0.6156, "step": 3350 }, { "epoch": 0.8333333333333334, "grad_norm": 0.0037994384765625, "learning_rate": 6.400000000000001e-05, "loss": 0.4474, "step": 3400 }, { "epoch": 0.8455882352941176, "grad_norm": 5.1875, "learning_rate": 6.2e-05, "loss": 0.7234, "step": 3450 }, { "epoch": 0.8578431372549019, "grad_norm": 0.00020694732666015625, "learning_rate": 6e-05, "loss": 0.6868, "step": 3500 }, { "epoch": 0.8700980392156863, "grad_norm": 0.00072479248046875, "learning_rate": 5.8e-05, "loss": 0.5759, "step": 3550 }, { "epoch": 0.8823529411764706, "grad_norm": 5.25, "learning_rate": 5.6000000000000006e-05, "loss": 0.6216, "step": 3600 }, { "epoch": 0.8946078431372549, "grad_norm": 6.625, "learning_rate": 5.4000000000000005e-05, "loss": 0.5348, "step": 3650 }, { "epoch": 0.9068627450980392, "grad_norm": 0.00012969970703125, "learning_rate": 5.2000000000000004e-05, "loss": 0.5576, "step": 3700 }, { "epoch": 0.9191176470588235, "grad_norm": 0.00020599365234375, "learning_rate": 5e-05, "loss": 0.364, "step": 3750 }, { "epoch": 0.9313725490196079, "grad_norm": 0.0027008056640625, "learning_rate": 4.8e-05, "loss": 0.3979, "step": 3800 }, { "epoch": 0.9436274509803921, "grad_norm": 0.00030517578125, "learning_rate": 4.600000000000001e-05, "loss": 0.5407, "step": 3850 }, { "epoch": 0.9558823529411765, "grad_norm": 0.0086669921875, "learning_rate": 4.4000000000000006e-05, "loss": 0.3277, "step": 3900 }, { "epoch": 0.9681372549019608, "grad_norm": 3.484375, "learning_rate": 4.2e-05, "loss": 0.4503, "step": 3950 }, { "epoch": 0.9803921568627451, "grad_norm": 0.000194549560546875, "learning_rate": 4e-05, "loss": 0.4995, "step": 4000 }, { "epoch": 0.9926470588235294, "grad_norm": 4.75, "learning_rate": 3.8e-05, "loss": 0.5699, "step": 4050 }, { "epoch": 1.0049019607843137, "grad_norm": 7.71875, "learning_rate": 3.6e-05, "loss": 0.6477, "step": 4100 }, { "epoch": 1.017156862745098, "grad_norm": 4.1484832763671875e-05, "learning_rate": 3.4000000000000007e-05, "loss": 0.305, "step": 4150 }, { "epoch": 1.0294117647058822, "grad_norm": 4.28125, "learning_rate": 3.2000000000000005e-05, "loss": 0.455, "step": 4200 }, { "epoch": 1.0416666666666667, "grad_norm": 0.0003204345703125, "learning_rate": 3e-05, "loss": 0.371, "step": 4250 }, { "epoch": 1.053921568627451, "grad_norm": 0.003509521484375, "learning_rate": 2.8000000000000003e-05, "loss": 0.4629, "step": 4300 }, { "epoch": 1.0661764705882353, "grad_norm": 0.01300048828125, "learning_rate": 2.6000000000000002e-05, "loss": 0.3884, "step": 4350 }, { "epoch": 1.0784313725490196, "grad_norm": 0.000324249267578125, "learning_rate": 2.4e-05, "loss": 0.3556, "step": 4400 }, { "epoch": 1.0906862745098038, "grad_norm": 0.000209808349609375, "learning_rate": 2.2000000000000003e-05, "loss": 0.4463, "step": 4450 }, { "epoch": 1.1029411764705883, "grad_norm": 7.53125, "learning_rate": 2e-05, "loss": 0.5725, "step": 4500 }, { "epoch": 1.1151960784313726, "grad_norm": 6.4375, "learning_rate": 1.8e-05, "loss": 0.3429, "step": 4550 }, { "epoch": 1.1274509803921569, "grad_norm": 0.00016689300537109375, "learning_rate": 1.6000000000000003e-05, "loss": 0.3612, "step": 4600 }, { "epoch": 1.1397058823529411, "grad_norm": 0.0006256103515625, "learning_rate": 1.4000000000000001e-05, "loss": 0.2825, "step": 4650 }, { "epoch": 1.1519607843137254, "grad_norm": 5.9375, "learning_rate": 1.2e-05, "loss": 0.2444, "step": 4700 }, { "epoch": 1.1642156862745099, "grad_norm": 0.00885009765625, "learning_rate": 1e-05, "loss": 0.395, "step": 4750 }, { "epoch": 1.1764705882352942, "grad_norm": 5.34375, "learning_rate": 8.000000000000001e-06, "loss": 0.5501, "step": 4800 }, { "epoch": 1.1887254901960784, "grad_norm": 9.918212890625e-05, "learning_rate": 6e-06, "loss": 0.2404, "step": 4850 }, { "epoch": 1.2009803921568627, "grad_norm": 0.00017452239990234375, "learning_rate": 4.000000000000001e-06, "loss": 0.6754, "step": 4900 }, { "epoch": 1.213235294117647, "grad_norm": 0.000152587890625, "learning_rate": 2.0000000000000003e-06, "loss": 0.5194, "step": 4950 }, { "epoch": 1.2254901960784315, "grad_norm": 3.314018249511719e-05, "learning_rate": 0.0, "loss": 0.3872, "step": 5000 } ], "logging_steps": 50, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.173511199830016e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }