{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 30720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009765625,
"grad_norm": null,
"learning_rate": 4.984375e-05,
"loss": 6.5469,
"step": 100
},
{
"epoch": 0.01953125,
"grad_norm": 15.062654495239258,
"learning_rate": 4.9682617187500003e-05,
"loss": 5.3177,
"step": 200
},
{
"epoch": 0.029296875,
"grad_norm": 13.20622730255127,
"learning_rate": 4.9519856770833334e-05,
"loss": 4.8951,
"step": 300
},
{
"epoch": 0.0390625,
"grad_norm": 14.794780731201172,
"learning_rate": 4.935709635416667e-05,
"loss": 4.8799,
"step": 400
},
{
"epoch": 0.048828125,
"grad_norm": 13.915122985839844,
"learning_rate": 4.919596354166667e-05,
"loss": 4.5381,
"step": 500
},
{
"epoch": 0.05859375,
"grad_norm": 15.835251808166504,
"learning_rate": 4.9033203125000005e-05,
"loss": 4.4279,
"step": 600
},
{
"epoch": 0.068359375,
"grad_norm": 14.109216690063477,
"learning_rate": 4.8870442708333335e-05,
"loss": 4.2612,
"step": 700
},
{
"epoch": 0.078125,
"grad_norm": 13.910305976867676,
"learning_rate": 4.8707682291666666e-05,
"loss": 4.2912,
"step": 800
},
{
"epoch": 0.087890625,
"grad_norm": 15.284989356994629,
"learning_rate": 4.854654947916667e-05,
"loss": 4.3003,
"step": 900
},
{
"epoch": 0.09765625,
"grad_norm": 16.042747497558594,
"learning_rate": 4.83837890625e-05,
"loss": 4.0609,
"step": 1000
},
{
"epoch": 0.107421875,
"grad_norm": 12.1378755569458,
"learning_rate": 4.8221028645833336e-05,
"loss": 4.1666,
"step": 1100
},
{
"epoch": 0.1171875,
"grad_norm": 13.27160930633545,
"learning_rate": 4.805826822916667e-05,
"loss": 4.1419,
"step": 1200
},
{
"epoch": 0.126953125,
"grad_norm": 16.022241592407227,
"learning_rate": 4.78955078125e-05,
"loss": 4.1586,
"step": 1300
},
{
"epoch": 0.13671875,
"grad_norm": 17.19588851928711,
"learning_rate": 4.7732747395833336e-05,
"loss": 3.9012,
"step": 1400
},
{
"epoch": 0.146484375,
"grad_norm": 13.319019317626953,
"learning_rate": 4.756998697916667e-05,
"loss": 4.017,
"step": 1500
},
{
"epoch": 0.15625,
"grad_norm": 18.409399032592773,
"learning_rate": 4.74072265625e-05,
"loss": 3.9484,
"step": 1600
},
{
"epoch": 0.166015625,
"grad_norm": 13.017057418823242,
"learning_rate": 4.7244466145833336e-05,
"loss": 3.9405,
"step": 1700
},
{
"epoch": 0.17578125,
"grad_norm": 20.20134162902832,
"learning_rate": 4.7081705729166667e-05,
"loss": 3.8163,
"step": 1800
},
{
"epoch": 0.185546875,
"grad_norm": 12.69709587097168,
"learning_rate": 4.6918945312500004e-05,
"loss": 3.8031,
"step": 1900
},
{
"epoch": 0.1953125,
"grad_norm": 15.624823570251465,
"learning_rate": 4.6756184895833335e-05,
"loss": 3.862,
"step": 2000
},
{
"epoch": 0.205078125,
"grad_norm": 18.287826538085938,
"learning_rate": 4.659342447916667e-05,
"loss": 3.8113,
"step": 2100
},
{
"epoch": 0.21484375,
"grad_norm": 13.958403587341309,
"learning_rate": 4.6430664062500004e-05,
"loss": 3.7175,
"step": 2200
},
{
"epoch": 0.224609375,
"grad_norm": 14.171894073486328,
"learning_rate": 4.6267903645833335e-05,
"loss": 3.8023,
"step": 2300
},
{
"epoch": 0.234375,
"grad_norm": 13.721501350402832,
"learning_rate": 4.610514322916667e-05,
"loss": 3.7811,
"step": 2400
},
{
"epoch": 0.244140625,
"grad_norm": 15.861538887023926,
"learning_rate": 4.5942382812500003e-05,
"loss": 3.6891,
"step": 2500
},
{
"epoch": 0.25390625,
"grad_norm": 12.93659496307373,
"learning_rate": 4.577962239583334e-05,
"loss": 3.6613,
"step": 2600
},
{
"epoch": 0.263671875,
"grad_norm": 14.38327407836914,
"learning_rate": 4.561686197916667e-05,
"loss": 4.0671,
"step": 2700
},
{
"epoch": 0.2734375,
"grad_norm": 21.85599136352539,
"learning_rate": 4.54541015625e-05,
"loss": 3.6713,
"step": 2800
},
{
"epoch": 0.283203125,
"grad_norm": 16.765880584716797,
"learning_rate": 4.529134114583334e-05,
"loss": 3.787,
"step": 2900
},
{
"epoch": 0.29296875,
"grad_norm": 12.277482032775879,
"learning_rate": 4.512858072916667e-05,
"loss": 3.634,
"step": 3000
},
{
"epoch": 0.302734375,
"grad_norm": 14.076525688171387,
"learning_rate": 4.49658203125e-05,
"loss": 3.6676,
"step": 3100
},
{
"epoch": 0.3125,
"grad_norm": 54.203609466552734,
"learning_rate": 4.480305989583334e-05,
"loss": 3.4946,
"step": 3200
},
{
"epoch": 0.322265625,
"grad_norm": 17.947622299194336,
"learning_rate": 4.464029947916667e-05,
"loss": 3.6398,
"step": 3300
},
{
"epoch": 0.33203125,
"grad_norm": 12.46594524383545,
"learning_rate": 4.44775390625e-05,
"loss": 3.6395,
"step": 3400
},
{
"epoch": 0.341796875,
"grad_norm": 14.41441822052002,
"learning_rate": 4.431477864583334e-05,
"loss": 3.5954,
"step": 3500
},
{
"epoch": 0.3515625,
"grad_norm": 11.247319221496582,
"learning_rate": 4.415201822916667e-05,
"loss": 3.7537,
"step": 3600
},
{
"epoch": 0.361328125,
"grad_norm": 15.799861907958984,
"learning_rate": 4.39892578125e-05,
"loss": 3.7104,
"step": 3700
},
{
"epoch": 0.37109375,
"grad_norm": 13.855748176574707,
"learning_rate": 4.382649739583334e-05,
"loss": 3.6212,
"step": 3800
},
{
"epoch": 0.380859375,
"grad_norm": 14.83035659790039,
"learning_rate": 4.366373697916667e-05,
"loss": 3.5802,
"step": 3900
},
{
"epoch": 0.390625,
"grad_norm": 13.719660758972168,
"learning_rate": 4.35009765625e-05,
"loss": 3.4662,
"step": 4000
},
{
"epoch": 0.400390625,
"grad_norm": null,
"learning_rate": 4.333821614583334e-05,
"loss": 3.5461,
"step": 4100
},
{
"epoch": 0.41015625,
"grad_norm": 13.233614921569824,
"learning_rate": 4.3177083333333334e-05,
"loss": 3.5612,
"step": 4200
},
{
"epoch": 0.419921875,
"grad_norm": 16.629112243652344,
"learning_rate": 4.3014322916666665e-05,
"loss": 3.4333,
"step": 4300
},
{
"epoch": 0.4296875,
"grad_norm": 84.17444610595703,
"learning_rate": 4.28515625e-05,
"loss": 3.4819,
"step": 4400
},
{
"epoch": 0.439453125,
"grad_norm": 14.212441444396973,
"learning_rate": 4.268880208333333e-05,
"loss": 3.4069,
"step": 4500
},
{
"epoch": 0.44921875,
"grad_norm": 20.573959350585938,
"learning_rate": 4.2526041666666664e-05,
"loss": 3.6456,
"step": 4600
},
{
"epoch": 0.458984375,
"grad_norm": 13.834342956542969,
"learning_rate": 4.236328125e-05,
"loss": 3.5382,
"step": 4700
},
{
"epoch": 0.46875,
"grad_norm": 14.079811096191406,
"learning_rate": 4.220052083333333e-05,
"loss": 3.4105,
"step": 4800
},
{
"epoch": 0.478515625,
"grad_norm": 12.509760856628418,
"learning_rate": 4.2037760416666664e-05,
"loss": 3.3461,
"step": 4900
},
{
"epoch": 0.48828125,
"grad_norm": 14.564861297607422,
"learning_rate": 4.1875e-05,
"loss": 3.4041,
"step": 5000
},
{
"epoch": 0.498046875,
"grad_norm": 14.737241744995117,
"learning_rate": 4.171223958333333e-05,
"loss": 3.507,
"step": 5100
},
{
"epoch": 0.5078125,
"grad_norm": 15.188499450683594,
"learning_rate": 4.154947916666666e-05,
"loss": 3.4917,
"step": 5200
},
{
"epoch": 0.517578125,
"grad_norm": 12.736299514770508,
"learning_rate": 4.138671875e-05,
"loss": 3.3674,
"step": 5300
},
{
"epoch": 0.52734375,
"grad_norm": 13.326681137084961,
"learning_rate": 4.122395833333333e-05,
"loss": 3.4286,
"step": 5400
},
{
"epoch": 0.537109375,
"grad_norm": 12.76373291015625,
"learning_rate": 4.106119791666666e-05,
"loss": 3.4353,
"step": 5500
},
{
"epoch": 0.546875,
"grad_norm": 15.256406784057617,
"learning_rate": 4.08984375e-05,
"loss": 3.4297,
"step": 5600
},
{
"epoch": 0.556640625,
"grad_norm": 11.880370140075684,
"learning_rate": 4.073567708333333e-05,
"loss": 3.2847,
"step": 5700
},
{
"epoch": 0.56640625,
"grad_norm": 15.305662155151367,
"learning_rate": 4.057291666666667e-05,
"loss": 3.3476,
"step": 5800
},
{
"epoch": 0.576171875,
"grad_norm": 24.128557205200195,
"learning_rate": 4.041015625e-05,
"loss": 3.4455,
"step": 5900
},
{
"epoch": 0.5859375,
"grad_norm": 13.900318145751953,
"learning_rate": 4.024739583333334e-05,
"loss": 3.4846,
"step": 6000
},
{
"epoch": 0.595703125,
"grad_norm": 11.240140914916992,
"learning_rate": 4.008463541666667e-05,
"loss": 3.4453,
"step": 6100
},
{
"epoch": 0.60546875,
"grad_norm": 14.209588050842285,
"learning_rate": 3.9921875e-05,
"loss": 3.3685,
"step": 6200
},
{
"epoch": 0.615234375,
"grad_norm": 16.545690536499023,
"learning_rate": 3.975911458333334e-05,
"loss": 3.3857,
"step": 6300
},
{
"epoch": 0.625,
"grad_norm": 12.255586624145508,
"learning_rate": 3.959635416666667e-05,
"loss": 3.3588,
"step": 6400
},
{
"epoch": 0.634765625,
"grad_norm": 15.788583755493164,
"learning_rate": 3.9433593750000006e-05,
"loss": 3.4022,
"step": 6500
},
{
"epoch": 0.64453125,
"grad_norm": 11.233269691467285,
"learning_rate": 3.92724609375e-05,
"loss": 3.3202,
"step": 6600
},
{
"epoch": 0.654296875,
"grad_norm": 16.585798263549805,
"learning_rate": 3.910970052083334e-05,
"loss": 3.2925,
"step": 6700
},
{
"epoch": 0.6640625,
"grad_norm": 15.305195808410645,
"learning_rate": 3.894694010416667e-05,
"loss": 3.3162,
"step": 6800
},
{
"epoch": 0.673828125,
"grad_norm": 11.933198928833008,
"learning_rate": 3.87841796875e-05,
"loss": 3.3133,
"step": 6900
},
{
"epoch": 0.68359375,
"grad_norm": 15.607390403747559,
"learning_rate": 3.862141927083334e-05,
"loss": 3.3407,
"step": 7000
},
{
"epoch": 0.693359375,
"grad_norm": 9.2060546875,
"learning_rate": 3.845865885416667e-05,
"loss": 3.3973,
"step": 7100
},
{
"epoch": 0.703125,
"grad_norm": 17.788448333740234,
"learning_rate": 3.82958984375e-05,
"loss": 3.261,
"step": 7200
},
{
"epoch": 0.712890625,
"grad_norm": 13.37771224975586,
"learning_rate": 3.813313802083334e-05,
"loss": 3.3159,
"step": 7300
},
{
"epoch": 0.72265625,
"grad_norm": 13.427688598632812,
"learning_rate": 3.797037760416667e-05,
"loss": 3.2451,
"step": 7400
},
{
"epoch": 0.732421875,
"grad_norm": 12.314142227172852,
"learning_rate": 3.78076171875e-05,
"loss": 3.2721,
"step": 7500
},
{
"epoch": 0.7421875,
"grad_norm": 11.418351173400879,
"learning_rate": 3.764485677083334e-05,
"loss": 3.2943,
"step": 7600
},
{
"epoch": 0.751953125,
"grad_norm": 16.546539306640625,
"learning_rate": 3.748209635416667e-05,
"loss": 3.2243,
"step": 7700
},
{
"epoch": 0.76171875,
"grad_norm": 16.984668731689453,
"learning_rate": 3.73193359375e-05,
"loss": 3.3024,
"step": 7800
},
{
"epoch": 0.771484375,
"grad_norm": 11.702521324157715,
"learning_rate": 3.715657552083334e-05,
"loss": 3.1967,
"step": 7900
},
{
"epoch": 0.78125,
"grad_norm": 12.537822723388672,
"learning_rate": 3.699381510416667e-05,
"loss": 3.2535,
"step": 8000
},
{
"epoch": 0.791015625,
"grad_norm": 13.640584945678711,
"learning_rate": 3.68310546875e-05,
"loss": 3.1677,
"step": 8100
},
{
"epoch": 0.80078125,
"grad_norm": 15.423649787902832,
"learning_rate": 3.6668294270833336e-05,
"loss": 3.2641,
"step": 8200
},
{
"epoch": 0.810546875,
"grad_norm": 12.065776824951172,
"learning_rate": 3.650553385416667e-05,
"loss": 3.1169,
"step": 8300
},
{
"epoch": 0.8203125,
"grad_norm": 14.259243965148926,
"learning_rate": 3.63427734375e-05,
"loss": 3.2357,
"step": 8400
},
{
"epoch": 0.830078125,
"grad_norm": 17.08042335510254,
"learning_rate": 3.6180013020833336e-05,
"loss": 3.3104,
"step": 8500
},
{
"epoch": 0.83984375,
"grad_norm": 10.867400169372559,
"learning_rate": 3.601725260416667e-05,
"loss": 3.296,
"step": 8600
},
{
"epoch": 0.849609375,
"grad_norm": 13.106012344360352,
"learning_rate": 3.58544921875e-05,
"loss": 3.2956,
"step": 8700
},
{
"epoch": 0.859375,
"grad_norm": 15.86103630065918,
"learning_rate": 3.5691731770833335e-05,
"loss": 3.2569,
"step": 8800
},
{
"epoch": 0.869140625,
"grad_norm": 11.86782169342041,
"learning_rate": 3.5528971354166666e-05,
"loss": 3.1775,
"step": 8900
},
{
"epoch": 0.87890625,
"grad_norm": 13.925124168395996,
"learning_rate": 3.53662109375e-05,
"loss": 3.1556,
"step": 9000
},
{
"epoch": 0.888671875,
"grad_norm": 15.298331260681152,
"learning_rate": 3.5203450520833335e-05,
"loss": 3.2081,
"step": 9100
},
{
"epoch": 0.8984375,
"grad_norm": 12.989383697509766,
"learning_rate": 3.5040690104166666e-05,
"loss": 3.2758,
"step": 9200
},
{
"epoch": 0.908203125,
"grad_norm": 12.797562599182129,
"learning_rate": 3.48779296875e-05,
"loss": 3.0895,
"step": 9300
},
{
"epoch": 0.91796875,
"grad_norm": 10.773366928100586,
"learning_rate": 3.4715169270833335e-05,
"loss": 3.1352,
"step": 9400
},
{
"epoch": 0.927734375,
"grad_norm": 13.436513900756836,
"learning_rate": 3.4552408854166665e-05,
"loss": 3.2934,
"step": 9500
},
{
"epoch": 0.9375,
"grad_norm": 12.20578670501709,
"learning_rate": 3.4389648437499996e-05,
"loss": 3.22,
"step": 9600
},
{
"epoch": 0.947265625,
"grad_norm": 15.03205680847168,
"learning_rate": 3.4226888020833334e-05,
"loss": 3.1925,
"step": 9700
},
{
"epoch": 0.95703125,
"grad_norm": 15.373735427856445,
"learning_rate": 3.4064127604166665e-05,
"loss": 3.1056,
"step": 9800
},
{
"epoch": 0.966796875,
"grad_norm": 13.795890808105469,
"learning_rate": 3.39013671875e-05,
"loss": 3.1484,
"step": 9900
},
{
"epoch": 0.9765625,
"grad_norm": 15.731973648071289,
"learning_rate": 3.3738606770833334e-05,
"loss": 3.2102,
"step": 10000
},
{
"epoch": 0.986328125,
"grad_norm": 14.516192436218262,
"learning_rate": 3.3575846354166665e-05,
"loss": 3.1795,
"step": 10100
},
{
"epoch": 0.99609375,
"grad_norm": 11.511063575744629,
"learning_rate": 3.34130859375e-05,
"loss": 3.1479,
"step": 10200
},
{
"epoch": 1.005859375,
"grad_norm": 13.947763442993164,
"learning_rate": 3.325032552083333e-05,
"loss": 3.1697,
"step": 10300
},
{
"epoch": 1.015625,
"grad_norm": 9.85244369506836,
"learning_rate": 3.308756510416667e-05,
"loss": 3.1739,
"step": 10400
},
{
"epoch": 1.025390625,
"grad_norm": 21.973459243774414,
"learning_rate": 3.29248046875e-05,
"loss": 2.9587,
"step": 10500
},
{
"epoch": 1.03515625,
"grad_norm": 11.420741081237793,
"learning_rate": 3.276204427083334e-05,
"loss": 3.1121,
"step": 10600
},
{
"epoch": 1.044921875,
"grad_norm": 10.605298042297363,
"learning_rate": 3.259928385416667e-05,
"loss": 3.0928,
"step": 10700
},
{
"epoch": 1.0546875,
"grad_norm": 15.158004760742188,
"learning_rate": 3.24365234375e-05,
"loss": 3.0272,
"step": 10800
},
{
"epoch": 1.064453125,
"grad_norm": 12.386876106262207,
"learning_rate": 3.227376302083334e-05,
"loss": 3.055,
"step": 10900
},
{
"epoch": 1.07421875,
"grad_norm": 14.404585838317871,
"learning_rate": 3.211100260416667e-05,
"loss": 3.0203,
"step": 11000
},
{
"epoch": 1.083984375,
"grad_norm": 14.187972068786621,
"learning_rate": 3.19482421875e-05,
"loss": 3.1315,
"step": 11100
},
{
"epoch": 1.09375,
"grad_norm": 11.506830215454102,
"learning_rate": 3.178548177083334e-05,
"loss": 3.0323,
"step": 11200
},
{
"epoch": 1.103515625,
"grad_norm": 14.504237174987793,
"learning_rate": 3.162272135416667e-05,
"loss": 2.9987,
"step": 11300
},
{
"epoch": 1.11328125,
"grad_norm": 12.95081615447998,
"learning_rate": 3.14599609375e-05,
"loss": 2.9773,
"step": 11400
},
{
"epoch": 1.123046875,
"grad_norm": 12.512577056884766,
"learning_rate": 3.129720052083334e-05,
"loss": 3.1454,
"step": 11500
},
{
"epoch": 1.1328125,
"grad_norm": 11.951395988464355,
"learning_rate": 3.113444010416667e-05,
"loss": 2.9314,
"step": 11600
},
{
"epoch": 1.142578125,
"grad_norm": 16.119001388549805,
"learning_rate": 3.09716796875e-05,
"loss": 2.9137,
"step": 11700
},
{
"epoch": 1.15234375,
"grad_norm": 10.542379379272461,
"learning_rate": 3.080891927083334e-05,
"loss": 3.1176,
"step": 11800
},
{
"epoch": 1.162109375,
"grad_norm": 14.544652938842773,
"learning_rate": 3.064615885416667e-05,
"loss": 2.9937,
"step": 11900
},
{
"epoch": 1.171875,
"grad_norm": 11.01784896850586,
"learning_rate": 3.04833984375e-05,
"loss": 3.0301,
"step": 12000
},
{
"epoch": 1.181640625,
"grad_norm": 12.359283447265625,
"learning_rate": 3.0320638020833337e-05,
"loss": 2.9277,
"step": 12100
},
{
"epoch": 1.19140625,
"grad_norm": 18.171897888183594,
"learning_rate": 3.015787760416667e-05,
"loss": 3.0673,
"step": 12200
},
{
"epoch": 1.201171875,
"grad_norm": 12.454042434692383,
"learning_rate": 2.99951171875e-05,
"loss": 2.9398,
"step": 12300
},
{
"epoch": 1.2109375,
"grad_norm": 12.611340522766113,
"learning_rate": 2.9832356770833337e-05,
"loss": 2.926,
"step": 12400
},
{
"epoch": 1.220703125,
"grad_norm": 12.207980155944824,
"learning_rate": 2.9669596354166668e-05,
"loss": 2.9713,
"step": 12500
},
{
"epoch": 1.23046875,
"grad_norm": 11.924321174621582,
"learning_rate": 2.95068359375e-05,
"loss": 3.0734,
"step": 12600
},
{
"epoch": 1.240234375,
"grad_norm": 10.981707572937012,
"learning_rate": 2.9344075520833337e-05,
"loss": 2.983,
"step": 12700
},
{
"epoch": 1.25,
"grad_norm": 9.699291229248047,
"learning_rate": 2.9181315104166667e-05,
"loss": 3.0235,
"step": 12800
},
{
"epoch": 1.259765625,
"grad_norm": 11.410511016845703,
"learning_rate": 2.90185546875e-05,
"loss": 3.081,
"step": 12900
},
{
"epoch": 1.26953125,
"grad_norm": 20.204944610595703,
"learning_rate": 2.8855794270833336e-05,
"loss": 2.958,
"step": 13000
},
{
"epoch": 1.279296875,
"grad_norm": 9.388766288757324,
"learning_rate": 2.8693033854166667e-05,
"loss": 3.0007,
"step": 13100
},
{
"epoch": 1.2890625,
"grad_norm": 11.05562973022461,
"learning_rate": 2.8530273437499998e-05,
"loss": 3.0186,
"step": 13200
},
{
"epoch": 1.298828125,
"grad_norm": 12.052275657653809,
"learning_rate": 2.8367513020833336e-05,
"loss": 2.9891,
"step": 13300
},
{
"epoch": 1.30859375,
"grad_norm": 17.93643569946289,
"learning_rate": 2.8204752604166667e-05,
"loss": 2.9863,
"step": 13400
},
{
"epoch": 1.318359375,
"grad_norm": 16.745187759399414,
"learning_rate": 2.8041992187499998e-05,
"loss": 2.963,
"step": 13500
},
{
"epoch": 1.328125,
"grad_norm": 11.541468620300293,
"learning_rate": 2.7879231770833335e-05,
"loss": 3.0955,
"step": 13600
},
{
"epoch": 1.337890625,
"grad_norm": 12.31360912322998,
"learning_rate": 2.7716471354166666e-05,
"loss": 3.0445,
"step": 13700
},
{
"epoch": 1.34765625,
"grad_norm": 12.349579811096191,
"learning_rate": 2.75537109375e-05,
"loss": 2.9165,
"step": 13800
},
{
"epoch": 1.357421875,
"grad_norm": 11.030657768249512,
"learning_rate": 2.7390950520833335e-05,
"loss": 2.9315,
"step": 13900
},
{
"epoch": 1.3671875,
"grad_norm": 14.333861351013184,
"learning_rate": 2.722819010416667e-05,
"loss": 3.0059,
"step": 14000
},
{
"epoch": 1.376953125,
"grad_norm": 15.17336368560791,
"learning_rate": 2.70654296875e-05,
"loss": 3.0364,
"step": 14100
},
{
"epoch": 1.38671875,
"grad_norm": 14.133316040039062,
"learning_rate": 2.6902669270833338e-05,
"loss": 3.0579,
"step": 14200
},
{
"epoch": 1.396484375,
"grad_norm": 11.148407936096191,
"learning_rate": 2.673990885416667e-05,
"loss": 2.975,
"step": 14300
},
{
"epoch": 1.40625,
"grad_norm": 14.075923919677734,
"learning_rate": 2.65771484375e-05,
"loss": 3.0044,
"step": 14400
},
{
"epoch": 1.416015625,
"grad_norm": 14.998821258544922,
"learning_rate": 2.6414388020833337e-05,
"loss": 2.9768,
"step": 14500
},
{
"epoch": 1.42578125,
"grad_norm": 12.305022239685059,
"learning_rate": 2.6251627604166668e-05,
"loss": 2.9711,
"step": 14600
},
{
"epoch": 1.435546875,
"grad_norm": 16.09569549560547,
"learning_rate": 2.609049479166667e-05,
"loss": 2.8913,
"step": 14700
},
{
"epoch": 1.4453125,
"grad_norm": 10.328545570373535,
"learning_rate": 2.5927734375e-05,
"loss": 2.9569,
"step": 14800
},
{
"epoch": 1.455078125,
"grad_norm": 10.365655899047852,
"learning_rate": 2.576497395833333e-05,
"loss": 2.9558,
"step": 14900
},
{
"epoch": 1.46484375,
"grad_norm": 11.824304580688477,
"learning_rate": 2.560221354166667e-05,
"loss": 2.906,
"step": 15000
},
{
"epoch": 1.474609375,
"grad_norm": 13.518112182617188,
"learning_rate": 2.5439453125e-05,
"loss": 2.9135,
"step": 15100
},
{
"epoch": 1.484375,
"grad_norm": 13.241069793701172,
"learning_rate": 2.527669270833333e-05,
"loss": 2.8683,
"step": 15200
},
{
"epoch": 1.494140625,
"grad_norm": 10.397425651550293,
"learning_rate": 2.5115559895833336e-05,
"loss": 2.9151,
"step": 15300
},
{
"epoch": 1.50390625,
"grad_norm": 9.971900939941406,
"learning_rate": 2.495279947916667e-05,
"loss": 2.8387,
"step": 15400
},
{
"epoch": 1.513671875,
"grad_norm": 12.148921966552734,
"learning_rate": 2.47900390625e-05,
"loss": 3.0045,
"step": 15500
},
{
"epoch": 1.5234375,
"grad_norm": 12.122273445129395,
"learning_rate": 2.4627278645833336e-05,
"loss": 2.8201,
"step": 15600
},
{
"epoch": 1.533203125,
"grad_norm": 10.754613876342773,
"learning_rate": 2.446451822916667e-05,
"loss": 3.0412,
"step": 15700
},
{
"epoch": 1.54296875,
"grad_norm": 12.5385103225708,
"learning_rate": 2.43017578125e-05,
"loss": 2.9698,
"step": 15800
},
{
"epoch": 1.552734375,
"grad_norm": 12.530341148376465,
"learning_rate": 2.4138997395833335e-05,
"loss": 2.9844,
"step": 15900
},
{
"epoch": 1.5625,
"grad_norm": 12.995160102844238,
"learning_rate": 2.397623697916667e-05,
"loss": 2.9505,
"step": 16000
},
{
"epoch": 1.572265625,
"grad_norm": 11.06143856048584,
"learning_rate": 2.38134765625e-05,
"loss": 2.9299,
"step": 16100
},
{
"epoch": 1.58203125,
"grad_norm": 11.600089073181152,
"learning_rate": 2.3650716145833335e-05,
"loss": 2.9184,
"step": 16200
},
{
"epoch": 1.591796875,
"grad_norm": 9.294841766357422,
"learning_rate": 2.348795572916667e-05,
"loss": 3.0312,
"step": 16300
},
{
"epoch": 1.6015625,
"grad_norm": 13.555610656738281,
"learning_rate": 2.33251953125e-05,
"loss": 2.899,
"step": 16400
},
{
"epoch": 1.611328125,
"grad_norm": 11.077611923217773,
"learning_rate": 2.3162434895833334e-05,
"loss": 2.9032,
"step": 16500
},
{
"epoch": 1.62109375,
"grad_norm": 12.092378616333008,
"learning_rate": 2.299967447916667e-05,
"loss": 3.0475,
"step": 16600
},
{
"epoch": 1.630859375,
"grad_norm": 18.95319175720215,
"learning_rate": 2.28369140625e-05,
"loss": 2.8485,
"step": 16700
},
{
"epoch": 1.640625,
"grad_norm": 20.765520095825195,
"learning_rate": 2.2674153645833334e-05,
"loss": 3.0211,
"step": 16800
},
{
"epoch": 1.650390625,
"grad_norm": 19.739500045776367,
"learning_rate": 2.2511393229166668e-05,
"loss": 3.0398,
"step": 16900
},
{
"epoch": 1.66015625,
"grad_norm": 12.64313793182373,
"learning_rate": 2.23486328125e-05,
"loss": 2.8028,
"step": 17000
},
{
"epoch": 1.669921875,
"grad_norm": 19.063640594482422,
"learning_rate": 2.2185872395833333e-05,
"loss": 2.896,
"step": 17100
},
{
"epoch": 1.6796875,
"grad_norm": 23.34886932373047,
"learning_rate": 2.2023111979166668e-05,
"loss": 2.9162,
"step": 17200
},
{
"epoch": 1.689453125,
"grad_norm": 11.44904899597168,
"learning_rate": 2.1860351562500002e-05,
"loss": 2.9423,
"step": 17300
},
{
"epoch": 1.69921875,
"grad_norm": 16.1793155670166,
"learning_rate": 2.1697591145833336e-05,
"loss": 2.917,
"step": 17400
},
{
"epoch": 1.708984375,
"grad_norm": 9.111202239990234,
"learning_rate": 2.1534830729166667e-05,
"loss": 2.9147,
"step": 17500
},
{
"epoch": 1.71875,
"grad_norm": 12.297972679138184,
"learning_rate": 2.13720703125e-05,
"loss": 2.8541,
"step": 17600
},
{
"epoch": 1.728515625,
"grad_norm": 14.833362579345703,
"learning_rate": 2.1209309895833336e-05,
"loss": 2.9548,
"step": 17700
},
{
"epoch": 1.73828125,
"grad_norm": 11.36043643951416,
"learning_rate": 2.1048177083333334e-05,
"loss": 2.8732,
"step": 17800
},
{
"epoch": 1.748046875,
"grad_norm": 12.69233512878418,
"learning_rate": 2.088541666666667e-05,
"loss": 2.9447,
"step": 17900
},
{
"epoch": 1.7578125,
"grad_norm": 13.44200611114502,
"learning_rate": 2.072265625e-05,
"loss": 2.8824,
"step": 18000
},
{
"epoch": 1.767578125,
"grad_norm": 10.753628730773926,
"learning_rate": 2.05615234375e-05,
"loss": 2.8455,
"step": 18100
},
{
"epoch": 1.77734375,
"grad_norm": 13.760424613952637,
"learning_rate": 2.0398763020833335e-05,
"loss": 2.9853,
"step": 18200
},
{
"epoch": 1.787109375,
"grad_norm": 16.471168518066406,
"learning_rate": 2.023600260416667e-05,
"loss": 2.8303,
"step": 18300
},
{
"epoch": 1.796875,
"grad_norm": 13.708788871765137,
"learning_rate": 2.0073242187500004e-05,
"loss": 2.9086,
"step": 18400
},
{
"epoch": 1.806640625,
"grad_norm": 13.522102355957031,
"learning_rate": 1.9912109375000002e-05,
"loss": 2.9567,
"step": 18500
},
{
"epoch": 1.81640625,
"grad_norm": 14.42663288116455,
"learning_rate": 1.9749348958333333e-05,
"loss": 2.8666,
"step": 18600
},
{
"epoch": 1.826171875,
"grad_norm": 10.03260326385498,
"learning_rate": 1.9586588541666667e-05,
"loss": 2.9193,
"step": 18700
},
{
"epoch": 1.8359375,
"grad_norm": 10.757763862609863,
"learning_rate": 1.9423828125e-05,
"loss": 3.0071,
"step": 18800
},
{
"epoch": 1.845703125,
"grad_norm": 12.414703369140625,
"learning_rate": 1.9261067708333332e-05,
"loss": 2.9446,
"step": 18900
},
{
"epoch": 1.85546875,
"grad_norm": 12.182251930236816,
"learning_rate": 1.9098307291666667e-05,
"loss": 2.9584,
"step": 19000
},
{
"epoch": 1.865234375,
"grad_norm": 13.588275909423828,
"learning_rate": 1.8935546875e-05,
"loss": 2.8303,
"step": 19100
},
{
"epoch": 1.875,
"grad_norm": 11.538961410522461,
"learning_rate": 1.8772786458333332e-05,
"loss": 2.9129,
"step": 19200
},
{
"epoch": 1.884765625,
"grad_norm": 14.856468200683594,
"learning_rate": 1.8610026041666666e-05,
"loss": 2.8229,
"step": 19300
},
{
"epoch": 1.89453125,
"grad_norm": 12.910444259643555,
"learning_rate": 1.8447265625e-05,
"loss": 2.9851,
"step": 19400
},
{
"epoch": 1.904296875,
"grad_norm": 12.187970161437988,
"learning_rate": 1.828450520833333e-05,
"loss": 2.863,
"step": 19500
},
{
"epoch": 1.9140625,
"grad_norm": 16.951251983642578,
"learning_rate": 1.8121744791666666e-05,
"loss": 2.8726,
"step": 19600
},
{
"epoch": 1.923828125,
"grad_norm": 10.626237869262695,
"learning_rate": 1.7958984375e-05,
"loss": 2.79,
"step": 19700
},
{
"epoch": 1.93359375,
"grad_norm": 13.63193416595459,
"learning_rate": 1.7796223958333334e-05,
"loss": 2.8807,
"step": 19800
},
{
"epoch": 1.943359375,
"grad_norm": 12.877738952636719,
"learning_rate": 1.763346354166667e-05,
"loss": 2.7945,
"step": 19900
},
{
"epoch": 1.953125,
"grad_norm": 12.08793830871582,
"learning_rate": 1.7470703125000003e-05,
"loss": 2.8932,
"step": 20000
},
{
"epoch": 1.962890625,
"grad_norm": 12.529877662658691,
"learning_rate": 1.7307942708333334e-05,
"loss": 2.9178,
"step": 20100
},
{
"epoch": 1.97265625,
"grad_norm": 12.949920654296875,
"learning_rate": 1.7145182291666668e-05,
"loss": 2.8207,
"step": 20200
},
{
"epoch": 1.982421875,
"grad_norm": 16.327682495117188,
"learning_rate": 1.6982421875000003e-05,
"loss": 2.9098,
"step": 20300
},
{
"epoch": 1.9921875,
"grad_norm": 13.496146202087402,
"learning_rate": 1.6819661458333334e-05,
"loss": 2.7364,
"step": 20400
},
{
"epoch": 2.001953125,
"grad_norm": 18.41584014892578,
"learning_rate": 1.6656901041666668e-05,
"loss": 2.8233,
"step": 20500
},
{
"epoch": 2.01171875,
"grad_norm": 14.442363739013672,
"learning_rate": 1.6494140625000002e-05,
"loss": 2.7373,
"step": 20600
},
{
"epoch": 2.021484375,
"grad_norm": 13.7149019241333,
"learning_rate": 1.6331380208333333e-05,
"loss": 2.7693,
"step": 20700
},
{
"epoch": 2.03125,
"grad_norm": 17.81890296936035,
"learning_rate": 1.6168619791666667e-05,
"loss": 2.7528,
"step": 20800
},
{
"epoch": 2.041015625,
"grad_norm": 11.57737922668457,
"learning_rate": 1.6007486979166666e-05,
"loss": 2.7167,
"step": 20900
},
{
"epoch": 2.05078125,
"grad_norm": 12.967355728149414,
"learning_rate": 1.58447265625e-05,
"loss": 2.7034,
"step": 21000
},
{
"epoch": 2.060546875,
"grad_norm": 10.983343124389648,
"learning_rate": 1.568359375e-05,
"loss": 2.7766,
"step": 21100
},
{
"epoch": 2.0703125,
"grad_norm": 16.468509674072266,
"learning_rate": 1.5520833333333336e-05,
"loss": 2.7992,
"step": 21200
},
{
"epoch": 2.080078125,
"grad_norm": 16.79848861694336,
"learning_rate": 1.535807291666667e-05,
"loss": 2.7826,
"step": 21300
},
{
"epoch": 2.08984375,
"grad_norm": 16.537952423095703,
"learning_rate": 1.5195312500000001e-05,
"loss": 2.7119,
"step": 21400
},
{
"epoch": 2.099609375,
"grad_norm": 11.28368854522705,
"learning_rate": 1.5032552083333335e-05,
"loss": 2.7093,
"step": 21500
},
{
"epoch": 2.109375,
"grad_norm": 58.54060745239258,
"learning_rate": 1.4869791666666668e-05,
"loss": 2.7219,
"step": 21600
},
{
"epoch": 2.119140625,
"grad_norm": 12.917949676513672,
"learning_rate": 1.470703125e-05,
"loss": 2.8239,
"step": 21700
},
{
"epoch": 2.12890625,
"grad_norm": 18.68291664123535,
"learning_rate": 1.4544270833333335e-05,
"loss": 2.7574,
"step": 21800
},
{
"epoch": 2.138671875,
"grad_norm": 11.076837539672852,
"learning_rate": 1.4381510416666669e-05,
"loss": 2.8418,
"step": 21900
},
{
"epoch": 2.1484375,
"grad_norm": 16.057594299316406,
"learning_rate": 1.421875e-05,
"loss": 2.822,
"step": 22000
},
{
"epoch": 2.158203125,
"grad_norm": 17.087045669555664,
"learning_rate": 1.4055989583333334e-05,
"loss": 2.7588,
"step": 22100
},
{
"epoch": 2.16796875,
"grad_norm": 11.648773193359375,
"learning_rate": 1.3893229166666669e-05,
"loss": 2.717,
"step": 22200
},
{
"epoch": 2.177734375,
"grad_norm": 12.739274024963379,
"learning_rate": 1.373046875e-05,
"loss": 2.6956,
"step": 22300
},
{
"epoch": 2.1875,
"grad_norm": 10.17639446258545,
"learning_rate": 1.3567708333333334e-05,
"loss": 2.8241,
"step": 22400
},
{
"epoch": 2.197265625,
"grad_norm": 13.572341918945312,
"learning_rate": 1.3404947916666668e-05,
"loss": 2.7858,
"step": 22500
},
{
"epoch": 2.20703125,
"grad_norm": 14.310699462890625,
"learning_rate": 1.3242187500000001e-05,
"loss": 2.6909,
"step": 22600
},
{
"epoch": 2.216796875,
"grad_norm": 11.991633415222168,
"learning_rate": 1.3079427083333335e-05,
"loss": 2.7708,
"step": 22700
},
{
"epoch": 2.2265625,
"grad_norm": 14.214717864990234,
"learning_rate": 1.2916666666666668e-05,
"loss": 2.7603,
"step": 22800
},
{
"epoch": 2.236328125,
"grad_norm": 16.019987106323242,
"learning_rate": 1.275390625e-05,
"loss": 2.7493,
"step": 22900
},
{
"epoch": 2.24609375,
"grad_norm": 11.817761421203613,
"learning_rate": 1.2591145833333335e-05,
"loss": 2.7647,
"step": 23000
},
{
"epoch": 2.255859375,
"grad_norm": 16.247276306152344,
"learning_rate": 1.2430013020833335e-05,
"loss": 2.7605,
"step": 23100
},
{
"epoch": 2.265625,
"grad_norm": 11.79404067993164,
"learning_rate": 1.2267252604166667e-05,
"loss": 2.7058,
"step": 23200
},
{
"epoch": 2.275390625,
"grad_norm": 12.77724838256836,
"learning_rate": 1.2104492187500001e-05,
"loss": 2.7446,
"step": 23300
},
{
"epoch": 2.28515625,
"grad_norm": 11.609589576721191,
"learning_rate": 1.1941731770833334e-05,
"loss": 2.7794,
"step": 23400
},
{
"epoch": 2.294921875,
"grad_norm": 13.240425109863281,
"learning_rate": 1.1780598958333334e-05,
"loss": 2.8056,
"step": 23500
},
{
"epoch": 2.3046875,
"grad_norm": 15.682677268981934,
"learning_rate": 1.1617838541666668e-05,
"loss": 2.7102,
"step": 23600
},
{
"epoch": 2.314453125,
"grad_norm": 15.334482192993164,
"learning_rate": 1.1455078125e-05,
"loss": 2.8026,
"step": 23700
},
{
"epoch": 2.32421875,
"grad_norm": 11.944014549255371,
"learning_rate": 1.1292317708333335e-05,
"loss": 2.7106,
"step": 23800
},
{
"epoch": 2.333984375,
"grad_norm": 13.437361717224121,
"learning_rate": 1.1129557291666668e-05,
"loss": 2.6837,
"step": 23900
},
{
"epoch": 2.34375,
"grad_norm": 15.150136947631836,
"learning_rate": 1.0966796875e-05,
"loss": 2.7095,
"step": 24000
},
{
"epoch": 2.353515625,
"grad_norm": 13.133088111877441,
"learning_rate": 1.0804036458333335e-05,
"loss": 2.7609,
"step": 24100
},
{
"epoch": 2.36328125,
"grad_norm": 12.005653381347656,
"learning_rate": 1.0641276041666667e-05,
"loss": 2.7086,
"step": 24200
},
{
"epoch": 2.373046875,
"grad_norm": 20.258712768554688,
"learning_rate": 1.0478515625e-05,
"loss": 2.8082,
"step": 24300
},
{
"epoch": 2.3828125,
"grad_norm": 14.602194786071777,
"learning_rate": 1.0315755208333334e-05,
"loss": 2.6246,
"step": 24400
},
{
"epoch": 2.392578125,
"grad_norm": 13.00714111328125,
"learning_rate": 1.0152994791666667e-05,
"loss": 2.8249,
"step": 24500
},
{
"epoch": 2.40234375,
"grad_norm": 11.645508766174316,
"learning_rate": 9.990234375e-06,
"loss": 2.7555,
"step": 24600
},
{
"epoch": 2.412109375,
"grad_norm": 17.61017417907715,
"learning_rate": 9.827473958333334e-06,
"loss": 2.6942,
"step": 24700
},
{
"epoch": 2.421875,
"grad_norm": 12.34157943725586,
"learning_rate": 9.664713541666668e-06,
"loss": 2.7215,
"step": 24800
},
{
"epoch": 2.431640625,
"grad_norm": 12.765501976013184,
"learning_rate": 9.501953125e-06,
"loss": 2.5747,
"step": 24900
},
{
"epoch": 2.44140625,
"grad_norm": 12.85317611694336,
"learning_rate": 9.339192708333335e-06,
"loss": 2.7917,
"step": 25000
},
{
"epoch": 2.451171875,
"grad_norm": 12.610406875610352,
"learning_rate": 9.176432291666668e-06,
"loss": 2.8635,
"step": 25100
},
{
"epoch": 2.4609375,
"grad_norm": 12.031999588012695,
"learning_rate": 9.013671875e-06,
"loss": 2.6288,
"step": 25200
},
{
"epoch": 2.470703125,
"grad_norm": 10.63759994506836,
"learning_rate": 8.850911458333335e-06,
"loss": 2.7396,
"step": 25300
},
{
"epoch": 2.48046875,
"grad_norm": 18.768993377685547,
"learning_rate": 8.688151041666667e-06,
"loss": 2.6733,
"step": 25400
},
{
"epoch": 2.490234375,
"grad_norm": 9.411011695861816,
"learning_rate": 8.525390625e-06,
"loss": 2.7355,
"step": 25500
},
{
"epoch": 2.5,
"grad_norm": 11.31039810180664,
"learning_rate": 8.362630208333334e-06,
"loss": 2.6648,
"step": 25600
},
{
"epoch": 2.509765625,
"grad_norm": 11.34940242767334,
"learning_rate": 8.199869791666667e-06,
"loss": 2.6185,
"step": 25700
},
{
"epoch": 2.51953125,
"grad_norm": 13.539913177490234,
"learning_rate": 8.037109375e-06,
"loss": 2.6344,
"step": 25800
},
{
"epoch": 2.529296875,
"grad_norm": 12.985583305358887,
"learning_rate": 7.874348958333334e-06,
"loss": 2.7028,
"step": 25900
},
{
"epoch": 2.5390625,
"grad_norm": 23.021692276000977,
"learning_rate": 7.711588541666666e-06,
"loss": 2.6719,
"step": 26000
},
{
"epoch": 2.548828125,
"grad_norm": 14.796003341674805,
"learning_rate": 7.548828125e-06,
"loss": 2.669,
"step": 26100
},
{
"epoch": 2.55859375,
"grad_norm": 16.303749084472656,
"learning_rate": 7.386067708333334e-06,
"loss": 2.5954,
"step": 26200
},
{
"epoch": 2.568359375,
"grad_norm": 10.631623268127441,
"learning_rate": 7.223307291666667e-06,
"loss": 2.669,
"step": 26300
},
{
"epoch": 2.578125,
"grad_norm": 13.405618667602539,
"learning_rate": 7.060546875e-06,
"loss": 2.7102,
"step": 26400
},
{
"epoch": 2.587890625,
"grad_norm": 16.89972496032715,
"learning_rate": 6.897786458333335e-06,
"loss": 2.7198,
"step": 26500
},
{
"epoch": 2.59765625,
"grad_norm": 14.60909652709961,
"learning_rate": 6.735026041666667e-06,
"loss": 2.686,
"step": 26600
},
{
"epoch": 2.607421875,
"grad_norm": 16.859573364257812,
"learning_rate": 6.572265625e-06,
"loss": 2.7073,
"step": 26700
},
{
"epoch": 2.6171875,
"grad_norm": 12.876221656799316,
"learning_rate": 6.409505208333334e-06,
"loss": 2.66,
"step": 26800
},
{
"epoch": 2.626953125,
"grad_norm": 12.116389274597168,
"learning_rate": 6.246744791666667e-06,
"loss": 2.6773,
"step": 26900
},
{
"epoch": 2.63671875,
"grad_norm": 13.397444725036621,
"learning_rate": 6.083984375e-06,
"loss": 2.5832,
"step": 27000
},
{
"epoch": 2.646484375,
"grad_norm": 14.27937126159668,
"learning_rate": 5.921223958333334e-06,
"loss": 2.7137,
"step": 27100
},
{
"epoch": 2.65625,
"grad_norm": 12.069489479064941,
"learning_rate": 5.758463541666667e-06,
"loss": 2.6435,
"step": 27200
},
{
"epoch": 2.666015625,
"grad_norm": 11.179854393005371,
"learning_rate": 5.595703125e-06,
"loss": 2.707,
"step": 27300
},
{
"epoch": 2.67578125,
"grad_norm": 11.071802139282227,
"learning_rate": 5.432942708333333e-06,
"loss": 2.6166,
"step": 27400
},
{
"epoch": 2.685546875,
"grad_norm": 14.306278228759766,
"learning_rate": 5.270182291666667e-06,
"loss": 2.6747,
"step": 27500
},
{
"epoch": 2.6953125,
"grad_norm": 14.3062744140625,
"learning_rate": 5.107421875e-06,
"loss": 2.5382,
"step": 27600
},
{
"epoch": 2.705078125,
"grad_norm": 14.975716590881348,
"learning_rate": 4.944661458333334e-06,
"loss": 2.7215,
"step": 27700
},
{
"epoch": 2.71484375,
"grad_norm": 14.584077835083008,
"learning_rate": 4.781901041666667e-06,
"loss": 2.7351,
"step": 27800
},
{
"epoch": 2.724609375,
"grad_norm": 11.181657791137695,
"learning_rate": 4.619140625e-06,
"loss": 2.576,
"step": 27900
},
{
"epoch": 2.734375,
"grad_norm": 10.974489212036133,
"learning_rate": 4.456380208333333e-06,
"loss": 2.7631,
"step": 28000
},
{
"epoch": 2.744140625,
"grad_norm": 15.731523513793945,
"learning_rate": 4.295247395833334e-06,
"loss": 2.5512,
"step": 28100
},
{
"epoch": 2.75390625,
"grad_norm": 12.23338794708252,
"learning_rate": 4.132486979166667e-06,
"loss": 2.6233,
"step": 28200
},
{
"epoch": 2.763671875,
"grad_norm": 9.420981407165527,
"learning_rate": 3.9697265625e-06,
"loss": 2.6464,
"step": 28300
},
{
"epoch": 2.7734375,
"grad_norm": 10.947881698608398,
"learning_rate": 3.8069661458333335e-06,
"loss": 2.6197,
"step": 28400
},
{
"epoch": 2.783203125,
"grad_norm": 17.761030197143555,
"learning_rate": 3.644205729166667e-06,
"loss": 2.6341,
"step": 28500
},
{
"epoch": 2.79296875,
"grad_norm": 15.768482208251953,
"learning_rate": 3.4814453125e-06,
"loss": 2.653,
"step": 28600
},
{
"epoch": 2.802734375,
"grad_norm": 13.388958930969238,
"learning_rate": 3.3186848958333335e-06,
"loss": 2.5511,
"step": 28700
},
{
"epoch": 2.8125,
"grad_norm": 11.864120483398438,
"learning_rate": 3.155924479166667e-06,
"loss": 2.7781,
"step": 28800
},
{
"epoch": 2.822265625,
"grad_norm": 11.44416618347168,
"learning_rate": 2.9931640625e-06,
"loss": 2.6667,
"step": 28900
},
{
"epoch": 2.83203125,
"grad_norm": 13.012479782104492,
"learning_rate": 2.8304036458333335e-06,
"loss": 2.6217,
"step": 29000
},
{
"epoch": 2.841796875,
"grad_norm": 14.73644733428955,
"learning_rate": 2.667643229166667e-06,
"loss": 2.606,
"step": 29100
},
{
"epoch": 2.8515625,
"grad_norm": 12.075024604797363,
"learning_rate": 2.5048828125e-06,
"loss": 2.5863,
"step": 29200
},
{
"epoch": 2.861328125,
"grad_norm": 14.664973258972168,
"learning_rate": 2.3421223958333335e-06,
"loss": 2.7423,
"step": 29300
},
{
"epoch": 2.87109375,
"grad_norm": 14.096378326416016,
"learning_rate": 2.179361979166667e-06,
"loss": 2.7122,
"step": 29400
},
{
"epoch": 2.880859375,
"grad_norm": 12.812738418579102,
"learning_rate": 2.0166015625e-06,
"loss": 2.7648,
"step": 29500
},
{
"epoch": 2.890625,
"grad_norm": 16.189590454101562,
"learning_rate": 1.8538411458333335e-06,
"loss": 2.669,
"step": 29600
},
{
"epoch": 2.900390625,
"grad_norm": 14.232686996459961,
"learning_rate": 1.6910807291666667e-06,
"loss": 2.6242,
"step": 29700
},
{
"epoch": 2.91015625,
"grad_norm": 16.499113082885742,
"learning_rate": 1.5283203125000002e-06,
"loss": 2.7261,
"step": 29800
},
{
"epoch": 2.919921875,
"grad_norm": 10.421613693237305,
"learning_rate": 1.3655598958333332e-06,
"loss": 2.5805,
"step": 29900
},
{
"epoch": 2.9296875,
"grad_norm": 11.867895126342773,
"learning_rate": 1.2027994791666667e-06,
"loss": 2.6225,
"step": 30000
},
{
"epoch": 2.939453125,
"grad_norm": 14.396392822265625,
"learning_rate": 1.0400390625000002e-06,
"loss": 2.591,
"step": 30100
},
{
"epoch": 2.94921875,
"grad_norm": 26.637563705444336,
"learning_rate": 8.772786458333333e-07,
"loss": 2.6324,
"step": 30200
},
{
"epoch": 2.958984375,
"grad_norm": 13.43743896484375,
"learning_rate": 7.145182291666667e-07,
"loss": 2.7171,
"step": 30300
},
{
"epoch": 2.96875,
"grad_norm": 13.423970222473145,
"learning_rate": 5.517578125000001e-07,
"loss": 2.5509,
"step": 30400
},
{
"epoch": 2.978515625,
"grad_norm": 12.26883316040039,
"learning_rate": 3.889973958333334e-07,
"loss": 2.6303,
"step": 30500
},
{
"epoch": 2.98828125,
"grad_norm": 13.305875778198242,
"learning_rate": 2.262369791666667e-07,
"loss": 2.6542,
"step": 30600
},
{
"epoch": 2.998046875,
"grad_norm": 18.230594635009766,
"learning_rate": 6.347656250000001e-08,
"loss": 2.6088,
"step": 30700
},
{
"epoch": 3.0,
"step": 30720,
"total_flos": 4.206282006594048e+16,
"train_loss": 3.0830205624302227,
"train_runtime": 6816.3036,
"train_samples_per_second": 72.109,
"train_steps_per_second": 4.507
}
],
"logging_steps": 100,
"max_steps": 30720,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.206282006594048e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}