| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2254901960784315, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012254901960784314, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 0.9967, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.024509803921568627, |
| "grad_norm": 0.0010223388671875, |
| "learning_rate": 0.000196, |
| "loss": 0.8313, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03676470588235294, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 0.000194, |
| "loss": 0.768, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.049019607843137254, |
| "grad_norm": 11.25, |
| "learning_rate": 0.000192, |
| "loss": 0.6193, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.061274509803921566, |
| "grad_norm": 0.00038909912109375, |
| "learning_rate": 0.00019, |
| "loss": 0.6394, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.07352941176470588, |
| "grad_norm": 0.001739501953125, |
| "learning_rate": 0.000188, |
| "loss": 0.6524, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0857843137254902, |
| "grad_norm": 0.00115203857421875, |
| "learning_rate": 0.00018600000000000002, |
| "loss": 0.7889, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.09803921568627451, |
| "grad_norm": 2.96875, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 0.5214, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.11029411764705882, |
| "grad_norm": 0.001251220703125, |
| "learning_rate": 0.000182, |
| "loss": 0.7389, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.12254901960784313, |
| "grad_norm": 5.3125, |
| "learning_rate": 0.00018, |
| "loss": 0.8295, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.13480392156862744, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 0.907, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.14705882352941177, |
| "grad_norm": 5.0625, |
| "learning_rate": 0.00017600000000000002, |
| "loss": 0.7512, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.15931372549019607, |
| "grad_norm": 0.006805419921875, |
| "learning_rate": 0.000174, |
| "loss": 0.5408, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1715686274509804, |
| "grad_norm": 0.00019550323486328125, |
| "learning_rate": 0.000172, |
| "loss": 1.0083, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.18382352941176472, |
| "grad_norm": 8.046627044677734e-06, |
| "learning_rate": 0.00017, |
| "loss": 0.7069, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.19607843137254902, |
| "grad_norm": 3.921875, |
| "learning_rate": 0.000168, |
| "loss": 0.705, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.20833333333333334, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 0.000166, |
| "loss": 0.8727, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.22058823529411764, |
| "grad_norm": 0.1220703125, |
| "learning_rate": 0.000164, |
| "loss": 0.6222, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.23284313725490197, |
| "grad_norm": 0.002166748046875, |
| "learning_rate": 0.000162, |
| "loss": 0.5294, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.24509803921568626, |
| "grad_norm": 0.00244140625, |
| "learning_rate": 0.00016, |
| "loss": 0.5111, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.25735294117647056, |
| "grad_norm": 6.854534149169922e-06, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 0.7514, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2696078431372549, |
| "grad_norm": 6.78125, |
| "learning_rate": 0.00015600000000000002, |
| "loss": 0.6244, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2818627450980392, |
| "grad_norm": 0.002227783203125, |
| "learning_rate": 0.000154, |
| "loss": 0.9231, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.29411764705882354, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.000152, |
| "loss": 0.5098, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.30637254901960786, |
| "grad_norm": 0.00982666015625, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 0.7692, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.31862745098039214, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.000148, |
| "loss": 0.704, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.33088235294117646, |
| "grad_norm": 10.75, |
| "learning_rate": 0.000146, |
| "loss": 0.4461, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.3431372549019608, |
| "grad_norm": 0.0040283203125, |
| "learning_rate": 0.000144, |
| "loss": 0.8833, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3553921568627451, |
| "grad_norm": 1.125, |
| "learning_rate": 0.000142, |
| "loss": 0.7667, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.36764705882352944, |
| "grad_norm": 3.53125, |
| "learning_rate": 0.00014, |
| "loss": 0.4744, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3799019607843137, |
| "grad_norm": 0.0004367828369140625, |
| "learning_rate": 0.000138, |
| "loss": 0.6662, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.39215686274509803, |
| "grad_norm": 0.00011301040649414062, |
| "learning_rate": 0.00013600000000000003, |
| "loss": 0.4074, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.40441176470588236, |
| "grad_norm": 4.57763671875e-05, |
| "learning_rate": 0.000134, |
| "loss": 0.5632, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 0.0001239776611328125, |
| "learning_rate": 0.000132, |
| "loss": 0.7105, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.42892156862745096, |
| "grad_norm": 0.000339508056640625, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.7244, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.4411764705882353, |
| "grad_norm": 4.125, |
| "learning_rate": 0.00012800000000000002, |
| "loss": 0.7665, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.4534313725490196, |
| "grad_norm": 0.00133514404296875, |
| "learning_rate": 0.000126, |
| "loss": 0.4724, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.46568627450980393, |
| "grad_norm": 3.015625, |
| "learning_rate": 0.000124, |
| "loss": 0.6663, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.47794117647058826, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.000122, |
| "loss": 0.5986, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.49019607843137253, |
| "grad_norm": 0.0004730224609375, |
| "learning_rate": 0.00012, |
| "loss": 0.4979, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5024509803921569, |
| "grad_norm": 15.375, |
| "learning_rate": 0.000118, |
| "loss": 0.6502, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5147058823529411, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.000116, |
| "loss": 0.7484, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.5269607843137255, |
| "grad_norm": 0.000682830810546875, |
| "learning_rate": 0.00011399999999999999, |
| "loss": 0.5358, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5392156862745098, |
| "grad_norm": 0.006134033203125, |
| "learning_rate": 0.00011200000000000001, |
| "loss": 0.6424, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5514705882352942, |
| "grad_norm": 5.14984130859375e-05, |
| "learning_rate": 0.00011000000000000002, |
| "loss": 0.4999, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5637254901960784, |
| "grad_norm": 4.96875, |
| "learning_rate": 0.00010800000000000001, |
| "loss": 0.5711, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5759803921568627, |
| "grad_norm": 4.029273986816406e-05, |
| "learning_rate": 0.00010600000000000002, |
| "loss": 0.6556, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 0.00010442733764648438, |
| "learning_rate": 0.00010400000000000001, |
| "loss": 0.6942, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6004901960784313, |
| "grad_norm": 0.00110626220703125, |
| "learning_rate": 0.00010200000000000001, |
| "loss": 0.837, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6127450980392157, |
| "grad_norm": 6.961822509765625e-05, |
| "learning_rate": 0.0001, |
| "loss": 0.7442, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 3.640625, |
| "learning_rate": 9.8e-05, |
| "loss": 0.683, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6372549019607843, |
| "grad_norm": 0.01531982421875, |
| "learning_rate": 9.6e-05, |
| "loss": 0.7334, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6495098039215687, |
| "grad_norm": 8.916854858398438e-05, |
| "learning_rate": 9.4e-05, |
| "loss": 0.5578, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6617647058823529, |
| "grad_norm": 0.0001468658447265625, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 0.5033, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6740196078431373, |
| "grad_norm": 0.00014781951904296875, |
| "learning_rate": 9e-05, |
| "loss": 0.3723, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.6862745098039216, |
| "grad_norm": 5.6875, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.0245, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6985294117647058, |
| "grad_norm": 0.0012969970703125, |
| "learning_rate": 8.6e-05, |
| "loss": 0.4378, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.7107843137254902, |
| "grad_norm": 0.0028839111328125, |
| "learning_rate": 8.4e-05, |
| "loss": 0.2688, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.7230392156862745, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 8.2e-05, |
| "loss": 0.5873, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7352941176470589, |
| "grad_norm": 13.0, |
| "learning_rate": 8e-05, |
| "loss": 0.7184, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7475490196078431, |
| "grad_norm": 0.000759124755859375, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 0.6188, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.7598039215686274, |
| "grad_norm": 0.00311279296875, |
| "learning_rate": 7.6e-05, |
| "loss": 0.6364, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7720588235294118, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 7.4e-05, |
| "loss": 0.3319, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7843137254901961, |
| "grad_norm": 6.0625, |
| "learning_rate": 7.2e-05, |
| "loss": 0.579, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7965686274509803, |
| "grad_norm": 0.00823974609375, |
| "learning_rate": 7e-05, |
| "loss": 0.3845, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.8088235294117647, |
| "grad_norm": 5.78125, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 0.5561, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.821078431372549, |
| "grad_norm": 4.0625, |
| "learning_rate": 6.6e-05, |
| "loss": 0.6156, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 0.0037994384765625, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.4474, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8455882352941176, |
| "grad_norm": 5.1875, |
| "learning_rate": 6.2e-05, |
| "loss": 0.7234, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.8578431372549019, |
| "grad_norm": 0.00020694732666015625, |
| "learning_rate": 6e-05, |
| "loss": 0.6868, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8700980392156863, |
| "grad_norm": 0.00072479248046875, |
| "learning_rate": 5.8e-05, |
| "loss": 0.5759, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.8823529411764706, |
| "grad_norm": 5.25, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 0.6216, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8946078431372549, |
| "grad_norm": 6.625, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 0.5348, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.9068627450980392, |
| "grad_norm": 0.00012969970703125, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 0.5576, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.9191176470588235, |
| "grad_norm": 0.00020599365234375, |
| "learning_rate": 5e-05, |
| "loss": 0.364, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.9313725490196079, |
| "grad_norm": 0.0027008056640625, |
| "learning_rate": 4.8e-05, |
| "loss": 0.3979, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.9436274509803921, |
| "grad_norm": 0.00030517578125, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 0.5407, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.9558823529411765, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.3277, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.9681372549019608, |
| "grad_norm": 3.484375, |
| "learning_rate": 4.2e-05, |
| "loss": 0.4503, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.9803921568627451, |
| "grad_norm": 0.000194549560546875, |
| "learning_rate": 4e-05, |
| "loss": 0.4995, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9926470588235294, |
| "grad_norm": 4.75, |
| "learning_rate": 3.8e-05, |
| "loss": 0.5699, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.0049019607843137, |
| "grad_norm": 7.71875, |
| "learning_rate": 3.6e-05, |
| "loss": 0.6477, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.017156862745098, |
| "grad_norm": 4.1484832763671875e-05, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 0.305, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.0294117647058822, |
| "grad_norm": 4.28125, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.455, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.0416666666666667, |
| "grad_norm": 0.0003204345703125, |
| "learning_rate": 3e-05, |
| "loss": 0.371, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.053921568627451, |
| "grad_norm": 0.003509521484375, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 0.4629, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.0661764705882353, |
| "grad_norm": 0.01300048828125, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 0.3884, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.0784313725490196, |
| "grad_norm": 0.000324249267578125, |
| "learning_rate": 2.4e-05, |
| "loss": 0.3556, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.0906862745098038, |
| "grad_norm": 0.000209808349609375, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 0.4463, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.1029411764705883, |
| "grad_norm": 7.53125, |
| "learning_rate": 2e-05, |
| "loss": 0.5725, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.1151960784313726, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.8e-05, |
| "loss": 0.3429, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.1274509803921569, |
| "grad_norm": 0.00016689300537109375, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.3612, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.1397058823529411, |
| "grad_norm": 0.0006256103515625, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 0.2825, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.1519607843137254, |
| "grad_norm": 5.9375, |
| "learning_rate": 1.2e-05, |
| "loss": 0.2444, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.1642156862745099, |
| "grad_norm": 0.00885009765625, |
| "learning_rate": 1e-05, |
| "loss": 0.395, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 5.34375, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.5501, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1887254901960784, |
| "grad_norm": 9.918212890625e-05, |
| "learning_rate": 6e-06, |
| "loss": 0.2404, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.2009803921568627, |
| "grad_norm": 0.00017452239990234375, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.6754, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.213235294117647, |
| "grad_norm": 0.000152587890625, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.5194, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.2254901960784315, |
| "grad_norm": 3.314018249511719e-05, |
| "learning_rate": 0.0, |
| "loss": 0.3872, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.173511199830016e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|