| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 30720, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.009765625, | |
|             "grad_norm": null, | |
| "learning_rate": 4.984375e-05, | |
| "loss": 6.5469, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01953125, | |
| "grad_norm": 15.062654495239258, | |
| "learning_rate": 4.9682617187500003e-05, | |
| "loss": 5.3177, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.029296875, | |
| "grad_norm": 13.20622730255127, | |
| "learning_rate": 4.9519856770833334e-05, | |
| "loss": 4.8951, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0390625, | |
| "grad_norm": 14.794780731201172, | |
| "learning_rate": 4.935709635416667e-05, | |
| "loss": 4.8799, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.048828125, | |
| "grad_norm": 13.915122985839844, | |
| "learning_rate": 4.919596354166667e-05, | |
| "loss": 4.5381, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05859375, | |
| "grad_norm": 15.835251808166504, | |
| "learning_rate": 4.9033203125000005e-05, | |
| "loss": 4.4279, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.068359375, | |
| "grad_norm": 14.109216690063477, | |
| "learning_rate": 4.8870442708333335e-05, | |
| "loss": 4.2612, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.078125, | |
| "grad_norm": 13.910305976867676, | |
| "learning_rate": 4.8707682291666666e-05, | |
| "loss": 4.2912, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.087890625, | |
| "grad_norm": 15.284989356994629, | |
| "learning_rate": 4.854654947916667e-05, | |
| "loss": 4.3003, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.09765625, | |
| "grad_norm": 16.042747497558594, | |
| "learning_rate": 4.83837890625e-05, | |
| "loss": 4.0609, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.107421875, | |
| "grad_norm": 12.1378755569458, | |
| "learning_rate": 4.8221028645833336e-05, | |
| "loss": 4.1666, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1171875, | |
| "grad_norm": 13.27160930633545, | |
| "learning_rate": 4.805826822916667e-05, | |
| "loss": 4.1419, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.126953125, | |
| "grad_norm": 16.022241592407227, | |
| "learning_rate": 4.78955078125e-05, | |
| "loss": 4.1586, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.13671875, | |
| "grad_norm": 17.19588851928711, | |
| "learning_rate": 4.7732747395833336e-05, | |
| "loss": 3.9012, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.146484375, | |
| "grad_norm": 13.319019317626953, | |
| "learning_rate": 4.756998697916667e-05, | |
| "loss": 4.017, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 18.409399032592773, | |
| "learning_rate": 4.74072265625e-05, | |
| "loss": 3.9484, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.166015625, | |
| "grad_norm": 13.017057418823242, | |
| "learning_rate": 4.7244466145833336e-05, | |
| "loss": 3.9405, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.17578125, | |
| "grad_norm": 20.20134162902832, | |
| "learning_rate": 4.7081705729166667e-05, | |
| "loss": 3.8163, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.185546875, | |
| "grad_norm": 12.69709587097168, | |
| "learning_rate": 4.6918945312500004e-05, | |
| "loss": 3.8031, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1953125, | |
| "grad_norm": 15.624823570251465, | |
| "learning_rate": 4.6756184895833335e-05, | |
| "loss": 3.862, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.205078125, | |
| "grad_norm": 18.287826538085938, | |
| "learning_rate": 4.659342447916667e-05, | |
| "loss": 3.8113, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.21484375, | |
| "grad_norm": 13.958403587341309, | |
| "learning_rate": 4.6430664062500004e-05, | |
| "loss": 3.7175, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.224609375, | |
| "grad_norm": 14.171894073486328, | |
| "learning_rate": 4.6267903645833335e-05, | |
| "loss": 3.8023, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.234375, | |
| "grad_norm": 13.721501350402832, | |
| "learning_rate": 4.610514322916667e-05, | |
| "loss": 3.7811, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.244140625, | |
| "grad_norm": 15.861538887023926, | |
| "learning_rate": 4.5942382812500003e-05, | |
| "loss": 3.6891, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.25390625, | |
| "grad_norm": 12.93659496307373, | |
| "learning_rate": 4.577962239583334e-05, | |
| "loss": 3.6613, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.263671875, | |
| "grad_norm": 14.38327407836914, | |
| "learning_rate": 4.561686197916667e-05, | |
| "loss": 4.0671, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.2734375, | |
| "grad_norm": 21.85599136352539, | |
| "learning_rate": 4.54541015625e-05, | |
| "loss": 3.6713, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.283203125, | |
| "grad_norm": 16.765880584716797, | |
| "learning_rate": 4.529134114583334e-05, | |
| "loss": 3.787, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.29296875, | |
| "grad_norm": 12.277482032775879, | |
| "learning_rate": 4.512858072916667e-05, | |
| "loss": 3.634, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.302734375, | |
| "grad_norm": 14.076525688171387, | |
| "learning_rate": 4.49658203125e-05, | |
| "loss": 3.6676, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 54.203609466552734, | |
| "learning_rate": 4.480305989583334e-05, | |
| "loss": 3.4946, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.322265625, | |
| "grad_norm": 17.947622299194336, | |
| "learning_rate": 4.464029947916667e-05, | |
| "loss": 3.6398, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.33203125, | |
| "grad_norm": 12.46594524383545, | |
| "learning_rate": 4.44775390625e-05, | |
| "loss": 3.6395, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.341796875, | |
| "grad_norm": 14.41441822052002, | |
| "learning_rate": 4.431477864583334e-05, | |
| "loss": 3.5954, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.3515625, | |
| "grad_norm": 11.247319221496582, | |
| "learning_rate": 4.415201822916667e-05, | |
| "loss": 3.7537, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.361328125, | |
| "grad_norm": 15.799861907958984, | |
| "learning_rate": 4.39892578125e-05, | |
| "loss": 3.7104, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.37109375, | |
| "grad_norm": 13.855748176574707, | |
| "learning_rate": 4.382649739583334e-05, | |
| "loss": 3.6212, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.380859375, | |
| "grad_norm": 14.83035659790039, | |
| "learning_rate": 4.366373697916667e-05, | |
| "loss": 3.5802, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "grad_norm": 13.719660758972168, | |
| "learning_rate": 4.35009765625e-05, | |
| "loss": 3.4662, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.400390625, | |
|             "grad_norm": null, | |
| "learning_rate": 4.333821614583334e-05, | |
| "loss": 3.5461, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.41015625, | |
| "grad_norm": 13.233614921569824, | |
| "learning_rate": 4.3177083333333334e-05, | |
| "loss": 3.5612, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.419921875, | |
| "grad_norm": 16.629112243652344, | |
| "learning_rate": 4.3014322916666665e-05, | |
| "loss": 3.4333, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.4296875, | |
| "grad_norm": 84.17444610595703, | |
| "learning_rate": 4.28515625e-05, | |
| "loss": 3.4819, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.439453125, | |
| "grad_norm": 14.212441444396973, | |
| "learning_rate": 4.268880208333333e-05, | |
| "loss": 3.4069, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.44921875, | |
| "grad_norm": 20.573959350585938, | |
| "learning_rate": 4.2526041666666664e-05, | |
| "loss": 3.6456, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.458984375, | |
| "grad_norm": 13.834342956542969, | |
| "learning_rate": 4.236328125e-05, | |
| "loss": 3.5382, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 14.079811096191406, | |
| "learning_rate": 4.220052083333333e-05, | |
| "loss": 3.4105, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.478515625, | |
| "grad_norm": 12.509760856628418, | |
| "learning_rate": 4.2037760416666664e-05, | |
| "loss": 3.3461, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.48828125, | |
| "grad_norm": 14.564861297607422, | |
| "learning_rate": 4.1875e-05, | |
| "loss": 3.4041, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.498046875, | |
| "grad_norm": 14.737241744995117, | |
| "learning_rate": 4.171223958333333e-05, | |
| "loss": 3.507, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.5078125, | |
| "grad_norm": 15.188499450683594, | |
| "learning_rate": 4.154947916666666e-05, | |
| "loss": 3.4917, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.517578125, | |
| "grad_norm": 12.736299514770508, | |
| "learning_rate": 4.138671875e-05, | |
| "loss": 3.3674, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.52734375, | |
| "grad_norm": 13.326681137084961, | |
| "learning_rate": 4.122395833333333e-05, | |
| "loss": 3.4286, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.537109375, | |
| "grad_norm": 12.76373291015625, | |
| "learning_rate": 4.106119791666666e-05, | |
| "loss": 3.4353, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.546875, | |
| "grad_norm": 15.256406784057617, | |
| "learning_rate": 4.08984375e-05, | |
| "loss": 3.4297, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.556640625, | |
| "grad_norm": 11.880370140075684, | |
| "learning_rate": 4.073567708333333e-05, | |
| "loss": 3.2847, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.56640625, | |
| "grad_norm": 15.305662155151367, | |
| "learning_rate": 4.057291666666667e-05, | |
| "loss": 3.3476, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.576171875, | |
| "grad_norm": 24.128557205200195, | |
| "learning_rate": 4.041015625e-05, | |
| "loss": 3.4455, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.5859375, | |
| "grad_norm": 13.900318145751953, | |
| "learning_rate": 4.024739583333334e-05, | |
| "loss": 3.4846, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.595703125, | |
| "grad_norm": 11.240140914916992, | |
| "learning_rate": 4.008463541666667e-05, | |
| "loss": 3.4453, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.60546875, | |
| "grad_norm": 14.209588050842285, | |
| "learning_rate": 3.9921875e-05, | |
| "loss": 3.3685, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.615234375, | |
| "grad_norm": 16.545690536499023, | |
| "learning_rate": 3.975911458333334e-05, | |
| "loss": 3.3857, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 12.255586624145508, | |
| "learning_rate": 3.959635416666667e-05, | |
| "loss": 3.3588, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.634765625, | |
| "grad_norm": 15.788583755493164, | |
| "learning_rate": 3.9433593750000006e-05, | |
| "loss": 3.4022, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.64453125, | |
| "grad_norm": 11.233269691467285, | |
| "learning_rate": 3.92724609375e-05, | |
| "loss": 3.3202, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.654296875, | |
| "grad_norm": 16.585798263549805, | |
| "learning_rate": 3.910970052083334e-05, | |
| "loss": 3.2925, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.6640625, | |
| "grad_norm": 15.305195808410645, | |
| "learning_rate": 3.894694010416667e-05, | |
| "loss": 3.3162, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.673828125, | |
| "grad_norm": 11.933198928833008, | |
| "learning_rate": 3.87841796875e-05, | |
| "loss": 3.3133, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.68359375, | |
| "grad_norm": 15.607390403747559, | |
| "learning_rate": 3.862141927083334e-05, | |
| "loss": 3.3407, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.693359375, | |
| "grad_norm": 9.2060546875, | |
| "learning_rate": 3.845865885416667e-05, | |
| "loss": 3.3973, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.703125, | |
| "grad_norm": 17.788448333740234, | |
| "learning_rate": 3.82958984375e-05, | |
| "loss": 3.261, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.712890625, | |
| "grad_norm": 13.37771224975586, | |
| "learning_rate": 3.813313802083334e-05, | |
| "loss": 3.3159, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.72265625, | |
| "grad_norm": 13.427688598632812, | |
| "learning_rate": 3.797037760416667e-05, | |
| "loss": 3.2451, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.732421875, | |
| "grad_norm": 12.314142227172852, | |
| "learning_rate": 3.78076171875e-05, | |
| "loss": 3.2721, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.7421875, | |
| "grad_norm": 11.418351173400879, | |
| "learning_rate": 3.764485677083334e-05, | |
| "loss": 3.2943, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.751953125, | |
| "grad_norm": 16.546539306640625, | |
| "learning_rate": 3.748209635416667e-05, | |
| "loss": 3.2243, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.76171875, | |
| "grad_norm": 16.984668731689453, | |
| "learning_rate": 3.73193359375e-05, | |
| "loss": 3.3024, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.771484375, | |
| "grad_norm": 11.702521324157715, | |
| "learning_rate": 3.715657552083334e-05, | |
| "loss": 3.1967, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 12.537822723388672, | |
| "learning_rate": 3.699381510416667e-05, | |
| "loss": 3.2535, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.791015625, | |
| "grad_norm": 13.640584945678711, | |
| "learning_rate": 3.68310546875e-05, | |
| "loss": 3.1677, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.80078125, | |
| "grad_norm": 15.423649787902832, | |
| "learning_rate": 3.6668294270833336e-05, | |
| "loss": 3.2641, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.810546875, | |
| "grad_norm": 12.065776824951172, | |
| "learning_rate": 3.650553385416667e-05, | |
| "loss": 3.1169, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.8203125, | |
| "grad_norm": 14.259243965148926, | |
| "learning_rate": 3.63427734375e-05, | |
| "loss": 3.2357, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.830078125, | |
| "grad_norm": 17.08042335510254, | |
| "learning_rate": 3.6180013020833336e-05, | |
| "loss": 3.3104, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.83984375, | |
| "grad_norm": 10.867400169372559, | |
| "learning_rate": 3.601725260416667e-05, | |
| "loss": 3.296, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.849609375, | |
| "grad_norm": 13.106012344360352, | |
| "learning_rate": 3.58544921875e-05, | |
| "loss": 3.2956, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.859375, | |
| "grad_norm": 15.86103630065918, | |
| "learning_rate": 3.5691731770833335e-05, | |
| "loss": 3.2569, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.869140625, | |
| "grad_norm": 11.86782169342041, | |
| "learning_rate": 3.5528971354166666e-05, | |
| "loss": 3.1775, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.87890625, | |
| "grad_norm": 13.925124168395996, | |
| "learning_rate": 3.53662109375e-05, | |
| "loss": 3.1556, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.888671875, | |
| "grad_norm": 15.298331260681152, | |
| "learning_rate": 3.5203450520833335e-05, | |
| "loss": 3.2081, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.8984375, | |
| "grad_norm": 12.989383697509766, | |
| "learning_rate": 3.5040690104166666e-05, | |
| "loss": 3.2758, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.908203125, | |
| "grad_norm": 12.797562599182129, | |
| "learning_rate": 3.48779296875e-05, | |
| "loss": 3.0895, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.91796875, | |
| "grad_norm": 10.773366928100586, | |
| "learning_rate": 3.4715169270833335e-05, | |
| "loss": 3.1352, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.927734375, | |
| "grad_norm": 13.436513900756836, | |
| "learning_rate": 3.4552408854166665e-05, | |
| "loss": 3.2934, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 12.20578670501709, | |
| "learning_rate": 3.4389648437499996e-05, | |
| "loss": 3.22, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.947265625, | |
| "grad_norm": 15.03205680847168, | |
| "learning_rate": 3.4226888020833334e-05, | |
| "loss": 3.1925, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.95703125, | |
| "grad_norm": 15.373735427856445, | |
| "learning_rate": 3.4064127604166665e-05, | |
| "loss": 3.1056, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.966796875, | |
| "grad_norm": 13.795890808105469, | |
| "learning_rate": 3.39013671875e-05, | |
| "loss": 3.1484, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.9765625, | |
| "grad_norm": 15.731973648071289, | |
| "learning_rate": 3.3738606770833334e-05, | |
| "loss": 3.2102, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.986328125, | |
| "grad_norm": 14.516192436218262, | |
| "learning_rate": 3.3575846354166665e-05, | |
| "loss": 3.1795, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.99609375, | |
| "grad_norm": 11.511063575744629, | |
| "learning_rate": 3.34130859375e-05, | |
| "loss": 3.1479, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.005859375, | |
| "grad_norm": 13.947763442993164, | |
| "learning_rate": 3.325032552083333e-05, | |
| "loss": 3.1697, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.015625, | |
| "grad_norm": 9.85244369506836, | |
| "learning_rate": 3.308756510416667e-05, | |
| "loss": 3.1739, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.025390625, | |
| "grad_norm": 21.973459243774414, | |
| "learning_rate": 3.29248046875e-05, | |
| "loss": 2.9587, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.03515625, | |
| "grad_norm": 11.420741081237793, | |
| "learning_rate": 3.276204427083334e-05, | |
| "loss": 3.1121, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.044921875, | |
| "grad_norm": 10.605298042297363, | |
| "learning_rate": 3.259928385416667e-05, | |
| "loss": 3.0928, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.0546875, | |
| "grad_norm": 15.158004760742188, | |
| "learning_rate": 3.24365234375e-05, | |
| "loss": 3.0272, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.064453125, | |
| "grad_norm": 12.386876106262207, | |
| "learning_rate": 3.227376302083334e-05, | |
| "loss": 3.055, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.07421875, | |
| "grad_norm": 14.404585838317871, | |
| "learning_rate": 3.211100260416667e-05, | |
| "loss": 3.0203, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.083984375, | |
| "grad_norm": 14.187972068786621, | |
| "learning_rate": 3.19482421875e-05, | |
| "loss": 3.1315, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.09375, | |
| "grad_norm": 11.506830215454102, | |
| "learning_rate": 3.178548177083334e-05, | |
| "loss": 3.0323, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.103515625, | |
| "grad_norm": 14.504237174987793, | |
| "learning_rate": 3.162272135416667e-05, | |
| "loss": 2.9987, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.11328125, | |
| "grad_norm": 12.95081615447998, | |
| "learning_rate": 3.14599609375e-05, | |
| "loss": 2.9773, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.123046875, | |
| "grad_norm": 12.512577056884766, | |
| "learning_rate": 3.129720052083334e-05, | |
| "loss": 3.1454, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.1328125, | |
| "grad_norm": 11.951395988464355, | |
| "learning_rate": 3.113444010416667e-05, | |
| "loss": 2.9314, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.142578125, | |
| "grad_norm": 16.119001388549805, | |
| "learning_rate": 3.09716796875e-05, | |
| "loss": 2.9137, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.15234375, | |
| "grad_norm": 10.542379379272461, | |
| "learning_rate": 3.080891927083334e-05, | |
| "loss": 3.1176, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.162109375, | |
| "grad_norm": 14.544652938842773, | |
| "learning_rate": 3.064615885416667e-05, | |
| "loss": 2.9937, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.171875, | |
| "grad_norm": 11.01784896850586, | |
| "learning_rate": 3.04833984375e-05, | |
| "loss": 3.0301, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.181640625, | |
| "grad_norm": 12.359283447265625, | |
| "learning_rate": 3.0320638020833337e-05, | |
| "loss": 2.9277, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.19140625, | |
| "grad_norm": 18.171897888183594, | |
| "learning_rate": 3.015787760416667e-05, | |
| "loss": 3.0673, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.201171875, | |
| "grad_norm": 12.454042434692383, | |
| "learning_rate": 2.99951171875e-05, | |
| "loss": 2.9398, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.2109375, | |
| "grad_norm": 12.611340522766113, | |
| "learning_rate": 2.9832356770833337e-05, | |
| "loss": 2.926, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.220703125, | |
| "grad_norm": 12.207980155944824, | |
| "learning_rate": 2.9669596354166668e-05, | |
| "loss": 2.9713, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.23046875, | |
| "grad_norm": 11.924321174621582, | |
| "learning_rate": 2.95068359375e-05, | |
| "loss": 3.0734, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.240234375, | |
| "grad_norm": 10.981707572937012, | |
| "learning_rate": 2.9344075520833337e-05, | |
| "loss": 2.983, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 9.699291229248047, | |
| "learning_rate": 2.9181315104166667e-05, | |
| "loss": 3.0235, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.259765625, | |
| "grad_norm": 11.410511016845703, | |
| "learning_rate": 2.90185546875e-05, | |
| "loss": 3.081, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.26953125, | |
| "grad_norm": 20.204944610595703, | |
| "learning_rate": 2.8855794270833336e-05, | |
| "loss": 2.958, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.279296875, | |
| "grad_norm": 9.388766288757324, | |
| "learning_rate": 2.8693033854166667e-05, | |
| "loss": 3.0007, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.2890625, | |
| "grad_norm": 11.05562973022461, | |
| "learning_rate": 2.8530273437499998e-05, | |
| "loss": 3.0186, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.298828125, | |
| "grad_norm": 12.052275657653809, | |
| "learning_rate": 2.8367513020833336e-05, | |
| "loss": 2.9891, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.30859375, | |
| "grad_norm": 17.93643569946289, | |
| "learning_rate": 2.8204752604166667e-05, | |
| "loss": 2.9863, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.318359375, | |
| "grad_norm": 16.745187759399414, | |
| "learning_rate": 2.8041992187499998e-05, | |
| "loss": 2.963, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.328125, | |
| "grad_norm": 11.541468620300293, | |
| "learning_rate": 2.7879231770833335e-05, | |
| "loss": 3.0955, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.337890625, | |
| "grad_norm": 12.31360912322998, | |
| "learning_rate": 2.7716471354166666e-05, | |
| "loss": 3.0445, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.34765625, | |
| "grad_norm": 12.349579811096191, | |
| "learning_rate": 2.75537109375e-05, | |
| "loss": 2.9165, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.357421875, | |
| "grad_norm": 11.030657768249512, | |
| "learning_rate": 2.7390950520833335e-05, | |
| "loss": 2.9315, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.3671875, | |
| "grad_norm": 14.333861351013184, | |
| "learning_rate": 2.722819010416667e-05, | |
| "loss": 3.0059, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.376953125, | |
| "grad_norm": 15.17336368560791, | |
| "learning_rate": 2.70654296875e-05, | |
| "loss": 3.0364, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.38671875, | |
| "grad_norm": 14.133316040039062, | |
| "learning_rate": 2.6902669270833338e-05, | |
| "loss": 3.0579, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.396484375, | |
| "grad_norm": 11.148407936096191, | |
| "learning_rate": 2.673990885416667e-05, | |
| "loss": 2.975, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.40625, | |
| "grad_norm": 14.075923919677734, | |
| "learning_rate": 2.65771484375e-05, | |
| "loss": 3.0044, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.416015625, | |
| "grad_norm": 14.998821258544922, | |
| "learning_rate": 2.6414388020833337e-05, | |
| "loss": 2.9768, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.42578125, | |
| "grad_norm": 12.305022239685059, | |
| "learning_rate": 2.6251627604166668e-05, | |
| "loss": 2.9711, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.435546875, | |
| "grad_norm": 16.09569549560547, | |
| "learning_rate": 2.609049479166667e-05, | |
| "loss": 2.8913, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.4453125, | |
| "grad_norm": 10.328545570373535, | |
| "learning_rate": 2.5927734375e-05, | |
| "loss": 2.9569, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.455078125, | |
| "grad_norm": 10.365655899047852, | |
| "learning_rate": 2.576497395833333e-05, | |
| "loss": 2.9558, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.46484375, | |
| "grad_norm": 11.824304580688477, | |
| "learning_rate": 2.560221354166667e-05, | |
| "loss": 2.906, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.474609375, | |
| "grad_norm": 13.518112182617188, | |
| "learning_rate": 2.5439453125e-05, | |
| "loss": 2.9135, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.484375, | |
| "grad_norm": 13.241069793701172, | |
| "learning_rate": 2.527669270833333e-05, | |
| "loss": 2.8683, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.494140625, | |
| "grad_norm": 10.397425651550293, | |
| "learning_rate": 2.5115559895833336e-05, | |
| "loss": 2.9151, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.50390625, | |
| "grad_norm": 9.971900939941406, | |
| "learning_rate": 2.495279947916667e-05, | |
| "loss": 2.8387, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.513671875, | |
| "grad_norm": 12.148921966552734, | |
| "learning_rate": 2.47900390625e-05, | |
| "loss": 3.0045, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.5234375, | |
| "grad_norm": 12.122273445129395, | |
| "learning_rate": 2.4627278645833336e-05, | |
| "loss": 2.8201, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.533203125, | |
| "grad_norm": 10.754613876342773, | |
| "learning_rate": 2.446451822916667e-05, | |
| "loss": 3.0412, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.54296875, | |
| "grad_norm": 12.5385103225708, | |
| "learning_rate": 2.43017578125e-05, | |
| "loss": 2.9698, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.552734375, | |
| "grad_norm": 12.530341148376465, | |
| "learning_rate": 2.4138997395833335e-05, | |
| "loss": 2.9844, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "grad_norm": 12.995160102844238, | |
| "learning_rate": 2.397623697916667e-05, | |
| "loss": 2.9505, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.572265625, | |
| "grad_norm": 11.06143856048584, | |
| "learning_rate": 2.38134765625e-05, | |
| "loss": 2.9299, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.58203125, | |
| "grad_norm": 11.600089073181152, | |
| "learning_rate": 2.3650716145833335e-05, | |
| "loss": 2.9184, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.591796875, | |
| "grad_norm": 9.294841766357422, | |
| "learning_rate": 2.348795572916667e-05, | |
| "loss": 3.0312, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.6015625, | |
| "grad_norm": 13.555610656738281, | |
| "learning_rate": 2.33251953125e-05, | |
| "loss": 2.899, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.611328125, | |
| "grad_norm": 11.077611923217773, | |
| "learning_rate": 2.3162434895833334e-05, | |
| "loss": 2.9032, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.62109375, | |
| "grad_norm": 12.092378616333008, | |
| "learning_rate": 2.299967447916667e-05, | |
| "loss": 3.0475, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.630859375, | |
| "grad_norm": 18.95319175720215, | |
| "learning_rate": 2.28369140625e-05, | |
| "loss": 2.8485, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.640625, | |
| "grad_norm": 20.765520095825195, | |
| "learning_rate": 2.2674153645833334e-05, | |
| "loss": 3.0211, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.650390625, | |
| "grad_norm": 19.739500045776367, | |
| "learning_rate": 2.2511393229166668e-05, | |
| "loss": 3.0398, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.66015625, | |
| "grad_norm": 12.64313793182373, | |
| "learning_rate": 2.23486328125e-05, | |
| "loss": 2.8028, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.669921875, | |
| "grad_norm": 19.063640594482422, | |
| "learning_rate": 2.2185872395833333e-05, | |
| "loss": 2.896, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.6796875, | |
| "grad_norm": 23.34886932373047, | |
| "learning_rate": 2.2023111979166668e-05, | |
| "loss": 2.9162, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.689453125, | |
| "grad_norm": 11.44904899597168, | |
| "learning_rate": 2.1860351562500002e-05, | |
| "loss": 2.9423, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.69921875, | |
| "grad_norm": 16.1793155670166, | |
| "learning_rate": 2.1697591145833336e-05, | |
| "loss": 2.917, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.708984375, | |
| "grad_norm": 9.111202239990234, | |
| "learning_rate": 2.1534830729166667e-05, | |
| "loss": 2.9147, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.71875, | |
| "grad_norm": 12.297972679138184, | |
| "learning_rate": 2.13720703125e-05, | |
| "loss": 2.8541, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.728515625, | |
| "grad_norm": 14.833362579345703, | |
| "learning_rate": 2.1209309895833336e-05, | |
| "loss": 2.9548, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.73828125, | |
| "grad_norm": 11.36043643951416, | |
| "learning_rate": 2.1048177083333334e-05, | |
| "loss": 2.8732, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.748046875, | |
| "grad_norm": 12.69233512878418, | |
| "learning_rate": 2.088541666666667e-05, | |
| "loss": 2.9447, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.7578125, | |
| "grad_norm": 13.44200611114502, | |
| "learning_rate": 2.072265625e-05, | |
| "loss": 2.8824, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.767578125, | |
| "grad_norm": 10.753628730773926, | |
| "learning_rate": 2.05615234375e-05, | |
| "loss": 2.8455, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.77734375, | |
| "grad_norm": 13.760424613952637, | |
| "learning_rate": 2.0398763020833335e-05, | |
| "loss": 2.9853, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.787109375, | |
| "grad_norm": 16.471168518066406, | |
| "learning_rate": 2.023600260416667e-05, | |
| "loss": 2.8303, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.796875, | |
| "grad_norm": 13.708788871765137, | |
| "learning_rate": 2.0073242187500004e-05, | |
| "loss": 2.9086, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.806640625, | |
| "grad_norm": 13.522102355957031, | |
| "learning_rate": 1.9912109375000002e-05, | |
| "loss": 2.9567, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.81640625, | |
| "grad_norm": 14.42663288116455, | |
| "learning_rate": 1.9749348958333333e-05, | |
| "loss": 2.8666, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.826171875, | |
| "grad_norm": 10.03260326385498, | |
| "learning_rate": 1.9586588541666667e-05, | |
| "loss": 2.9193, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.8359375, | |
| "grad_norm": 10.757763862609863, | |
| "learning_rate": 1.9423828125e-05, | |
| "loss": 3.0071, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.845703125, | |
| "grad_norm": 12.414703369140625, | |
| "learning_rate": 1.9261067708333332e-05, | |
| "loss": 2.9446, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.85546875, | |
| "grad_norm": 12.182251930236816, | |
| "learning_rate": 1.9098307291666667e-05, | |
| "loss": 2.9584, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.865234375, | |
| "grad_norm": 13.588275909423828, | |
| "learning_rate": 1.8935546875e-05, | |
| "loss": 2.8303, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 11.538961410522461, | |
| "learning_rate": 1.8772786458333332e-05, | |
| "loss": 2.9129, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.884765625, | |
| "grad_norm": 14.856468200683594, | |
| "learning_rate": 1.8610026041666666e-05, | |
| "loss": 2.8229, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.89453125, | |
| "grad_norm": 12.910444259643555, | |
| "learning_rate": 1.8447265625e-05, | |
| "loss": 2.9851, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.904296875, | |
| "grad_norm": 12.187970161437988, | |
| "learning_rate": 1.828450520833333e-05, | |
| "loss": 2.863, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.9140625, | |
| "grad_norm": 16.951251983642578, | |
| "learning_rate": 1.8121744791666666e-05, | |
| "loss": 2.8726, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.923828125, | |
| "grad_norm": 10.626237869262695, | |
| "learning_rate": 1.7958984375e-05, | |
| "loss": 2.79, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.93359375, | |
| "grad_norm": 13.63193416595459, | |
| "learning_rate": 1.7796223958333334e-05, | |
| "loss": 2.8807, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.943359375, | |
| "grad_norm": 12.877738952636719, | |
| "learning_rate": 1.763346354166667e-05, | |
| "loss": 2.7945, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.953125, | |
| "grad_norm": 12.08793830871582, | |
| "learning_rate": 1.7470703125000003e-05, | |
| "loss": 2.8932, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.962890625, | |
| "grad_norm": 12.529877662658691, | |
| "learning_rate": 1.7307942708333334e-05, | |
| "loss": 2.9178, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.97265625, | |
| "grad_norm": 12.949920654296875, | |
| "learning_rate": 1.7145182291666668e-05, | |
| "loss": 2.8207, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.982421875, | |
| "grad_norm": 16.327682495117188, | |
| "learning_rate": 1.6982421875000003e-05, | |
| "loss": 2.9098, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.9921875, | |
| "grad_norm": 13.496146202087402, | |
| "learning_rate": 1.6819661458333334e-05, | |
| "loss": 2.7364, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.001953125, | |
| "grad_norm": 18.41584014892578, | |
| "learning_rate": 1.6656901041666668e-05, | |
| "loss": 2.8233, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.01171875, | |
| "grad_norm": 14.442363739013672, | |
| "learning_rate": 1.6494140625000002e-05, | |
| "loss": 2.7373, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.021484375, | |
| "grad_norm": 13.7149019241333, | |
| "learning_rate": 1.6331380208333333e-05, | |
| "loss": 2.7693, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.03125, | |
| "grad_norm": 17.81890296936035, | |
| "learning_rate": 1.6168619791666667e-05, | |
| "loss": 2.7528, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.041015625, | |
| "grad_norm": 11.57737922668457, | |
| "learning_rate": 1.6007486979166666e-05, | |
| "loss": 2.7167, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.05078125, | |
| "grad_norm": 12.967355728149414, | |
| "learning_rate": 1.58447265625e-05, | |
| "loss": 2.7034, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.060546875, | |
| "grad_norm": 10.983343124389648, | |
| "learning_rate": 1.568359375e-05, | |
| "loss": 2.7766, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.0703125, | |
| "grad_norm": 16.468509674072266, | |
| "learning_rate": 1.5520833333333336e-05, | |
| "loss": 2.7992, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.080078125, | |
| "grad_norm": 16.79848861694336, | |
| "learning_rate": 1.535807291666667e-05, | |
| "loss": 2.7826, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.08984375, | |
| "grad_norm": 16.537952423095703, | |
| "learning_rate": 1.5195312500000001e-05, | |
| "loss": 2.7119, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.099609375, | |
| "grad_norm": 11.28368854522705, | |
| "learning_rate": 1.5032552083333335e-05, | |
| "loss": 2.7093, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.109375, | |
| "grad_norm": 58.54060745239258, | |
| "learning_rate": 1.4869791666666668e-05, | |
| "loss": 2.7219, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.119140625, | |
| "grad_norm": 12.917949676513672, | |
| "learning_rate": 1.470703125e-05, | |
| "loss": 2.8239, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.12890625, | |
| "grad_norm": 18.68291664123535, | |
| "learning_rate": 1.4544270833333335e-05, | |
| "loss": 2.7574, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.138671875, | |
| "grad_norm": 11.076837539672852, | |
| "learning_rate": 1.4381510416666669e-05, | |
| "loss": 2.8418, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.1484375, | |
| "grad_norm": 16.057594299316406, | |
| "learning_rate": 1.421875e-05, | |
| "loss": 2.822, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.158203125, | |
| "grad_norm": 17.087045669555664, | |
| "learning_rate": 1.4055989583333334e-05, | |
| "loss": 2.7588, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.16796875, | |
| "grad_norm": 11.648773193359375, | |
| "learning_rate": 1.3893229166666669e-05, | |
| "loss": 2.717, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.177734375, | |
| "grad_norm": 12.739274024963379, | |
| "learning_rate": 1.373046875e-05, | |
| "loss": 2.6956, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.1875, | |
| "grad_norm": 10.17639446258545, | |
| "learning_rate": 1.3567708333333334e-05, | |
| "loss": 2.8241, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.197265625, | |
| "grad_norm": 13.572341918945312, | |
| "learning_rate": 1.3404947916666668e-05, | |
| "loss": 2.7858, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.20703125, | |
| "grad_norm": 14.310699462890625, | |
| "learning_rate": 1.3242187500000001e-05, | |
| "loss": 2.6909, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.216796875, | |
| "grad_norm": 11.991633415222168, | |
| "learning_rate": 1.3079427083333335e-05, | |
| "loss": 2.7708, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.2265625, | |
| "grad_norm": 14.214717864990234, | |
| "learning_rate": 1.2916666666666668e-05, | |
| "loss": 2.7603, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.236328125, | |
| "grad_norm": 16.019987106323242, | |
| "learning_rate": 1.275390625e-05, | |
| "loss": 2.7493, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.24609375, | |
| "grad_norm": 11.817761421203613, | |
| "learning_rate": 1.2591145833333335e-05, | |
| "loss": 2.7647, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.255859375, | |
| "grad_norm": 16.247276306152344, | |
| "learning_rate": 1.2430013020833335e-05, | |
| "loss": 2.7605, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.265625, | |
| "grad_norm": 11.79404067993164, | |
| "learning_rate": 1.2267252604166667e-05, | |
| "loss": 2.7058, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.275390625, | |
| "grad_norm": 12.77724838256836, | |
| "learning_rate": 1.2104492187500001e-05, | |
| "loss": 2.7446, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.28515625, | |
| "grad_norm": 11.609589576721191, | |
| "learning_rate": 1.1941731770833334e-05, | |
| "loss": 2.7794, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.294921875, | |
| "grad_norm": 13.240425109863281, | |
| "learning_rate": 1.1780598958333334e-05, | |
| "loss": 2.8056, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.3046875, | |
| "grad_norm": 15.682677268981934, | |
| "learning_rate": 1.1617838541666668e-05, | |
| "loss": 2.7102, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.314453125, | |
| "grad_norm": 15.334482192993164, | |
| "learning_rate": 1.1455078125e-05, | |
| "loss": 2.8026, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.32421875, | |
| "grad_norm": 11.944014549255371, | |
| "learning_rate": 1.1292317708333335e-05, | |
| "loss": 2.7106, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.333984375, | |
| "grad_norm": 13.437361717224121, | |
| "learning_rate": 1.1129557291666668e-05, | |
| "loss": 2.6837, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "grad_norm": 15.150136947631836, | |
| "learning_rate": 1.0966796875e-05, | |
| "loss": 2.7095, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.353515625, | |
| "grad_norm": 13.133088111877441, | |
| "learning_rate": 1.0804036458333335e-05, | |
| "loss": 2.7609, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 2.36328125, | |
| "grad_norm": 12.005653381347656, | |
| "learning_rate": 1.0641276041666667e-05, | |
| "loss": 2.7086, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 2.373046875, | |
| "grad_norm": 20.258712768554688, | |
| "learning_rate": 1.0478515625e-05, | |
| "loss": 2.8082, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 2.3828125, | |
| "grad_norm": 14.602194786071777, | |
| "learning_rate": 1.0315755208333334e-05, | |
| "loss": 2.6246, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 2.392578125, | |
| "grad_norm": 13.00714111328125, | |
| "learning_rate": 1.0152994791666667e-05, | |
| "loss": 2.8249, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.40234375, | |
| "grad_norm": 11.645508766174316, | |
| "learning_rate": 9.990234375e-06, | |
| "loss": 2.7555, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 2.412109375, | |
| "grad_norm": 17.61017417907715, | |
| "learning_rate": 9.827473958333334e-06, | |
| "loss": 2.6942, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 2.421875, | |
| "grad_norm": 12.34157943725586, | |
| "learning_rate": 9.664713541666668e-06, | |
| "loss": 2.7215, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 2.431640625, | |
| "grad_norm": 12.765501976013184, | |
| "learning_rate": 9.501953125e-06, | |
| "loss": 2.5747, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 2.44140625, | |
| "grad_norm": 12.85317611694336, | |
| "learning_rate": 9.339192708333335e-06, | |
| "loss": 2.7917, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.451171875, | |
| "grad_norm": 12.610406875610352, | |
| "learning_rate": 9.176432291666668e-06, | |
| "loss": 2.8635, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 2.4609375, | |
| "grad_norm": 12.031999588012695, | |
| "learning_rate": 9.013671875e-06, | |
| "loss": 2.6288, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 2.470703125, | |
| "grad_norm": 10.63759994506836, | |
| "learning_rate": 8.850911458333335e-06, | |
| "loss": 2.7396, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 2.48046875, | |
| "grad_norm": 18.768993377685547, | |
| "learning_rate": 8.688151041666667e-06, | |
| "loss": 2.6733, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 2.490234375, | |
| "grad_norm": 9.411011695861816, | |
| "learning_rate": 8.525390625e-06, | |
| "loss": 2.7355, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 11.31039810180664, | |
| "learning_rate": 8.362630208333334e-06, | |
| "loss": 2.6648, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 2.509765625, | |
| "grad_norm": 11.34940242767334, | |
| "learning_rate": 8.199869791666667e-06, | |
| "loss": 2.6185, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 2.51953125, | |
| "grad_norm": 13.539913177490234, | |
| "learning_rate": 8.037109375e-06, | |
| "loss": 2.6344, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 2.529296875, | |
| "grad_norm": 12.985583305358887, | |
| "learning_rate": 7.874348958333334e-06, | |
| "loss": 2.7028, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 2.5390625, | |
| "grad_norm": 23.021692276000977, | |
| "learning_rate": 7.711588541666666e-06, | |
| "loss": 2.6719, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.548828125, | |
| "grad_norm": 14.796003341674805, | |
| "learning_rate": 7.548828125e-06, | |
| "loss": 2.669, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.55859375, | |
| "grad_norm": 16.303749084472656, | |
| "learning_rate": 7.386067708333334e-06, | |
| "loss": 2.5954, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.568359375, | |
| "grad_norm": 10.631623268127441, | |
| "learning_rate": 7.223307291666667e-06, | |
| "loss": 2.669, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.578125, | |
| "grad_norm": 13.405618667602539, | |
| "learning_rate": 7.060546875e-06, | |
| "loss": 2.7102, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.587890625, | |
| "grad_norm": 16.89972496032715, | |
| "learning_rate": 6.897786458333335e-06, | |
| "loss": 2.7198, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.59765625, | |
| "grad_norm": 14.60909652709961, | |
| "learning_rate": 6.735026041666667e-06, | |
| "loss": 2.686, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.607421875, | |
| "grad_norm": 16.859573364257812, | |
| "learning_rate": 6.572265625e-06, | |
| "loss": 2.7073, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.6171875, | |
| "grad_norm": 12.876221656799316, | |
| "learning_rate": 6.409505208333334e-06, | |
| "loss": 2.66, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.626953125, | |
| "grad_norm": 12.116389274597168, | |
| "learning_rate": 6.246744791666667e-06, | |
| "loss": 2.6773, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.63671875, | |
| "grad_norm": 13.397444725036621, | |
| "learning_rate": 6.083984375e-06, | |
| "loss": 2.5832, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.646484375, | |
| "grad_norm": 14.27937126159668, | |
| "learning_rate": 5.921223958333334e-06, | |
| "loss": 2.7137, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.65625, | |
| "grad_norm": 12.069489479064941, | |
| "learning_rate": 5.758463541666667e-06, | |
| "loss": 2.6435, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.666015625, | |
| "grad_norm": 11.179854393005371, | |
| "learning_rate": 5.595703125e-06, | |
| "loss": 2.707, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.67578125, | |
| "grad_norm": 11.071802139282227, | |
| "learning_rate": 5.432942708333333e-06, | |
| "loss": 2.6166, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.685546875, | |
| "grad_norm": 14.306278228759766, | |
| "learning_rate": 5.270182291666667e-06, | |
| "loss": 2.6747, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.6953125, | |
| "grad_norm": 14.3062744140625, | |
| "learning_rate": 5.107421875e-06, | |
| "loss": 2.5382, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.705078125, | |
| "grad_norm": 14.975716590881348, | |
| "learning_rate": 4.944661458333334e-06, | |
| "loss": 2.7215, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.71484375, | |
| "grad_norm": 14.584077835083008, | |
| "learning_rate": 4.781901041666667e-06, | |
| "loss": 2.7351, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.724609375, | |
| "grad_norm": 11.181657791137695, | |
| "learning_rate": 4.619140625e-06, | |
| "loss": 2.576, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "grad_norm": 10.974489212036133, | |
| "learning_rate": 4.456380208333333e-06, | |
| "loss": 2.7631, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.744140625, | |
| "grad_norm": 15.731523513793945, | |
| "learning_rate": 4.295247395833334e-06, | |
| "loss": 2.5512, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 2.75390625, | |
| "grad_norm": 12.23338794708252, | |
| "learning_rate": 4.132486979166667e-06, | |
| "loss": 2.6233, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 2.763671875, | |
| "grad_norm": 9.420981407165527, | |
| "learning_rate": 3.9697265625e-06, | |
| "loss": 2.6464, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 2.7734375, | |
| "grad_norm": 10.947881698608398, | |
| "learning_rate": 3.8069661458333335e-06, | |
| "loss": 2.6197, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.783203125, | |
| "grad_norm": 17.761030197143555, | |
| "learning_rate": 3.644205729166667e-06, | |
| "loss": 2.6341, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.79296875, | |
| "grad_norm": 15.768482208251953, | |
| "learning_rate": 3.4814453125e-06, | |
| "loss": 2.653, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.802734375, | |
| "grad_norm": 13.388958930969238, | |
| "learning_rate": 3.3186848958333335e-06, | |
| "loss": 2.5511, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.8125, | |
| "grad_norm": 11.864120483398438, | |
| "learning_rate": 3.155924479166667e-06, | |
| "loss": 2.7781, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.822265625, | |
| "grad_norm": 11.44416618347168, | |
| "learning_rate": 2.9931640625e-06, | |
| "loss": 2.6667, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.83203125, | |
| "grad_norm": 13.012479782104492, | |
| "learning_rate": 2.8304036458333335e-06, | |
| "loss": 2.6217, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.841796875, | |
| "grad_norm": 14.73644733428955, | |
| "learning_rate": 2.667643229166667e-06, | |
| "loss": 2.606, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.8515625, | |
| "grad_norm": 12.075024604797363, | |
| "learning_rate": 2.5048828125e-06, | |
| "loss": 2.5863, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.861328125, | |
| "grad_norm": 14.664973258972168, | |
| "learning_rate": 2.3421223958333335e-06, | |
| "loss": 2.7423, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.87109375, | |
| "grad_norm": 14.096378326416016, | |
| "learning_rate": 2.179361979166667e-06, | |
| "loss": 2.7122, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.880859375, | |
| "grad_norm": 12.812738418579102, | |
| "learning_rate": 2.0166015625e-06, | |
| "loss": 2.7648, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.890625, | |
| "grad_norm": 16.189590454101562, | |
| "learning_rate": 1.8538411458333335e-06, | |
| "loss": 2.669, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.900390625, | |
| "grad_norm": 14.232686996459961, | |
| "learning_rate": 1.6910807291666667e-06, | |
| "loss": 2.6242, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.91015625, | |
| "grad_norm": 16.499113082885742, | |
| "learning_rate": 1.5283203125000002e-06, | |
| "loss": 2.7261, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.919921875, | |
| "grad_norm": 10.421613693237305, | |
| "learning_rate": 1.3655598958333332e-06, | |
| "loss": 2.5805, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.9296875, | |
| "grad_norm": 11.867895126342773, | |
| "learning_rate": 1.2027994791666667e-06, | |
| "loss": 2.6225, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.939453125, | |
| "grad_norm": 14.396392822265625, | |
| "learning_rate": 1.0400390625000002e-06, | |
| "loss": 2.591, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.94921875, | |
| "grad_norm": 26.637563705444336, | |
| "learning_rate": 8.772786458333333e-07, | |
| "loss": 2.6324, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.958984375, | |
| "grad_norm": 13.43743896484375, | |
| "learning_rate": 7.145182291666667e-07, | |
| "loss": 2.7171, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.96875, | |
| "grad_norm": 13.423970222473145, | |
| "learning_rate": 5.517578125000001e-07, | |
| "loss": 2.5509, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.978515625, | |
| "grad_norm": 12.26883316040039, | |
| "learning_rate": 3.889973958333334e-07, | |
| "loss": 2.6303, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.98828125, | |
| "grad_norm": 13.305875778198242, | |
| "learning_rate": 2.262369791666667e-07, | |
| "loss": 2.6542, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.998046875, | |
| "grad_norm": 18.230594635009766, | |
| "learning_rate": 6.347656250000001e-08, | |
| "loss": 2.6088, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 30720, | |
| "total_flos": 4.206282006594048e+16, | |
| "train_loss": 3.0830205624302227, | |
| "train_runtime": 6816.3036, | |
| "train_samples_per_second": 72.109, | |
| "train_steps_per_second": 4.507 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 30720, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.206282006594048e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |