{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 30720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009765625, "grad_norm": null, "learning_rate": 4.984375e-05, "loss": 6.5469, "step": 100 }, { "epoch": 0.01953125, "grad_norm": 15.062654495239258, "learning_rate": 4.9682617187500003e-05, "loss": 5.3177, "step": 200 }, { "epoch": 0.029296875, "grad_norm": 13.20622730255127, "learning_rate": 4.9519856770833334e-05, "loss": 4.8951, "step": 300 }, { "epoch": 0.0390625, "grad_norm": 14.794780731201172, "learning_rate": 4.935709635416667e-05, "loss": 4.8799, "step": 400 }, { "epoch": 0.048828125, "grad_norm": 13.915122985839844, "learning_rate": 4.919596354166667e-05, "loss": 4.5381, "step": 500 }, { "epoch": 0.05859375, "grad_norm": 15.835251808166504, "learning_rate": 4.9033203125000005e-05, "loss": 4.4279, "step": 600 }, { "epoch": 0.068359375, "grad_norm": 14.109216690063477, "learning_rate": 4.8870442708333335e-05, "loss": 4.2612, "step": 700 }, { "epoch": 0.078125, "grad_norm": 13.910305976867676, "learning_rate": 4.8707682291666666e-05, "loss": 4.2912, "step": 800 }, { "epoch": 0.087890625, "grad_norm": 15.284989356994629, "learning_rate": 4.854654947916667e-05, "loss": 4.3003, "step": 900 }, { "epoch": 0.09765625, "grad_norm": 16.042747497558594, "learning_rate": 4.83837890625e-05, "loss": 4.0609, "step": 1000 }, { "epoch": 0.107421875, "grad_norm": 12.1378755569458, "learning_rate": 4.8221028645833336e-05, "loss": 4.1666, "step": 1100 }, { "epoch": 0.1171875, "grad_norm": 13.27160930633545, "learning_rate": 4.805826822916667e-05, "loss": 4.1419, "step": 1200 }, { "epoch": 0.126953125, "grad_norm": 16.022241592407227, "learning_rate": 4.78955078125e-05, "loss": 4.1586, "step": 1300 }, { "epoch": 0.13671875, "grad_norm": 17.19588851928711, "learning_rate": 4.7732747395833336e-05, "loss": 3.9012, "step": 1400 
}, { "epoch": 0.146484375, "grad_norm": 13.319019317626953, "learning_rate": 4.756998697916667e-05, "loss": 4.017, "step": 1500 }, { "epoch": 0.15625, "grad_norm": 18.409399032592773, "learning_rate": 4.74072265625e-05, "loss": 3.9484, "step": 1600 }, { "epoch": 0.166015625, "grad_norm": 13.017057418823242, "learning_rate": 4.7244466145833336e-05, "loss": 3.9405, "step": 1700 }, { "epoch": 0.17578125, "grad_norm": 20.20134162902832, "learning_rate": 4.7081705729166667e-05, "loss": 3.8163, "step": 1800 }, { "epoch": 0.185546875, "grad_norm": 12.69709587097168, "learning_rate": 4.6918945312500004e-05, "loss": 3.8031, "step": 1900 }, { "epoch": 0.1953125, "grad_norm": 15.624823570251465, "learning_rate": 4.6756184895833335e-05, "loss": 3.862, "step": 2000 }, { "epoch": 0.205078125, "grad_norm": 18.287826538085938, "learning_rate": 4.659342447916667e-05, "loss": 3.8113, "step": 2100 }, { "epoch": 0.21484375, "grad_norm": 13.958403587341309, "learning_rate": 4.6430664062500004e-05, "loss": 3.7175, "step": 2200 }, { "epoch": 0.224609375, "grad_norm": 14.171894073486328, "learning_rate": 4.6267903645833335e-05, "loss": 3.8023, "step": 2300 }, { "epoch": 0.234375, "grad_norm": 13.721501350402832, "learning_rate": 4.610514322916667e-05, "loss": 3.7811, "step": 2400 }, { "epoch": 0.244140625, "grad_norm": 15.861538887023926, "learning_rate": 4.5942382812500003e-05, "loss": 3.6891, "step": 2500 }, { "epoch": 0.25390625, "grad_norm": 12.93659496307373, "learning_rate": 4.577962239583334e-05, "loss": 3.6613, "step": 2600 }, { "epoch": 0.263671875, "grad_norm": 14.38327407836914, "learning_rate": 4.561686197916667e-05, "loss": 4.0671, "step": 2700 }, { "epoch": 0.2734375, "grad_norm": 21.85599136352539, "learning_rate": 4.54541015625e-05, "loss": 3.6713, "step": 2800 }, { "epoch": 0.283203125, "grad_norm": 16.765880584716797, "learning_rate": 4.529134114583334e-05, "loss": 3.787, "step": 2900 }, { "epoch": 0.29296875, "grad_norm": 12.277482032775879, "learning_rate": 
4.512858072916667e-05, "loss": 3.634, "step": 3000 }, { "epoch": 0.302734375, "grad_norm": 14.076525688171387, "learning_rate": 4.49658203125e-05, "loss": 3.6676, "step": 3100 }, { "epoch": 0.3125, "grad_norm": 54.203609466552734, "learning_rate": 4.480305989583334e-05, "loss": 3.4946, "step": 3200 }, { "epoch": 0.322265625, "grad_norm": 17.947622299194336, "learning_rate": 4.464029947916667e-05, "loss": 3.6398, "step": 3300 }, { "epoch": 0.33203125, "grad_norm": 12.46594524383545, "learning_rate": 4.44775390625e-05, "loss": 3.6395, "step": 3400 }, { "epoch": 0.341796875, "grad_norm": 14.41441822052002, "learning_rate": 4.431477864583334e-05, "loss": 3.5954, "step": 3500 }, { "epoch": 0.3515625, "grad_norm": 11.247319221496582, "learning_rate": 4.415201822916667e-05, "loss": 3.7537, "step": 3600 }, { "epoch": 0.361328125, "grad_norm": 15.799861907958984, "learning_rate": 4.39892578125e-05, "loss": 3.7104, "step": 3700 }, { "epoch": 0.37109375, "grad_norm": 13.855748176574707, "learning_rate": 4.382649739583334e-05, "loss": 3.6212, "step": 3800 }, { "epoch": 0.380859375, "grad_norm": 14.83035659790039, "learning_rate": 4.366373697916667e-05, "loss": 3.5802, "step": 3900 }, { "epoch": 0.390625, "grad_norm": 13.719660758972168, "learning_rate": 4.35009765625e-05, "loss": 3.4662, "step": 4000 }, { "epoch": 0.400390625, "grad_norm": null, "learning_rate": 4.333821614583334e-05, "loss": 3.5461, "step": 4100 }, { "epoch": 0.41015625, "grad_norm": 13.233614921569824, "learning_rate": 4.3177083333333334e-05, "loss": 3.5612, "step": 4200 }, { "epoch": 0.419921875, "grad_norm": 16.629112243652344, "learning_rate": 4.3014322916666665e-05, "loss": 3.4333, "step": 4300 }, { "epoch": 0.4296875, "grad_norm": 84.17444610595703, "learning_rate": 4.28515625e-05, "loss": 3.4819, "step": 4400 }, { "epoch": 0.439453125, "grad_norm": 14.212441444396973, "learning_rate": 4.268880208333333e-05, "loss": 3.4069, "step": 4500 }, { "epoch": 0.44921875, "grad_norm": 20.573959350585938, 
"learning_rate": 4.2526041666666664e-05, "loss": 3.6456, "step": 4600 }, { "epoch": 0.458984375, "grad_norm": 13.834342956542969, "learning_rate": 4.236328125e-05, "loss": 3.5382, "step": 4700 }, { "epoch": 0.46875, "grad_norm": 14.079811096191406, "learning_rate": 4.220052083333333e-05, "loss": 3.4105, "step": 4800 }, { "epoch": 0.478515625, "grad_norm": 12.509760856628418, "learning_rate": 4.2037760416666664e-05, "loss": 3.3461, "step": 4900 }, { "epoch": 0.48828125, "grad_norm": 14.564861297607422, "learning_rate": 4.1875e-05, "loss": 3.4041, "step": 5000 }, { "epoch": 0.498046875, "grad_norm": 14.737241744995117, "learning_rate": 4.171223958333333e-05, "loss": 3.507, "step": 5100 }, { "epoch": 0.5078125, "grad_norm": 15.188499450683594, "learning_rate": 4.154947916666666e-05, "loss": 3.4917, "step": 5200 }, { "epoch": 0.517578125, "grad_norm": 12.736299514770508, "learning_rate": 4.138671875e-05, "loss": 3.3674, "step": 5300 }, { "epoch": 0.52734375, "grad_norm": 13.326681137084961, "learning_rate": 4.122395833333333e-05, "loss": 3.4286, "step": 5400 }, { "epoch": 0.537109375, "grad_norm": 12.76373291015625, "learning_rate": 4.106119791666666e-05, "loss": 3.4353, "step": 5500 }, { "epoch": 0.546875, "grad_norm": 15.256406784057617, "learning_rate": 4.08984375e-05, "loss": 3.4297, "step": 5600 }, { "epoch": 0.556640625, "grad_norm": 11.880370140075684, "learning_rate": 4.073567708333333e-05, "loss": 3.2847, "step": 5700 }, { "epoch": 0.56640625, "grad_norm": 15.305662155151367, "learning_rate": 4.057291666666667e-05, "loss": 3.3476, "step": 5800 }, { "epoch": 0.576171875, "grad_norm": 24.128557205200195, "learning_rate": 4.041015625e-05, "loss": 3.4455, "step": 5900 }, { "epoch": 0.5859375, "grad_norm": 13.900318145751953, "learning_rate": 4.024739583333334e-05, "loss": 3.4846, "step": 6000 }, { "epoch": 0.595703125, "grad_norm": 11.240140914916992, "learning_rate": 4.008463541666667e-05, "loss": 3.4453, "step": 6100 }, { "epoch": 0.60546875, "grad_norm": 
14.209588050842285, "learning_rate": 3.9921875e-05, "loss": 3.3685, "step": 6200 }, { "epoch": 0.615234375, "grad_norm": 16.545690536499023, "learning_rate": 3.975911458333334e-05, "loss": 3.3857, "step": 6300 }, { "epoch": 0.625, "grad_norm": 12.255586624145508, "learning_rate": 3.959635416666667e-05, "loss": 3.3588, "step": 6400 }, { "epoch": 0.634765625, "grad_norm": 15.788583755493164, "learning_rate": 3.9433593750000006e-05, "loss": 3.4022, "step": 6500 }, { "epoch": 0.64453125, "grad_norm": 11.233269691467285, "learning_rate": 3.92724609375e-05, "loss": 3.3202, "step": 6600 }, { "epoch": 0.654296875, "grad_norm": 16.585798263549805, "learning_rate": 3.910970052083334e-05, "loss": 3.2925, "step": 6700 }, { "epoch": 0.6640625, "grad_norm": 15.305195808410645, "learning_rate": 3.894694010416667e-05, "loss": 3.3162, "step": 6800 }, { "epoch": 0.673828125, "grad_norm": 11.933198928833008, "learning_rate": 3.87841796875e-05, "loss": 3.3133, "step": 6900 }, { "epoch": 0.68359375, "grad_norm": 15.607390403747559, "learning_rate": 3.862141927083334e-05, "loss": 3.3407, "step": 7000 }, { "epoch": 0.693359375, "grad_norm": 9.2060546875, "learning_rate": 3.845865885416667e-05, "loss": 3.3973, "step": 7100 }, { "epoch": 0.703125, "grad_norm": 17.788448333740234, "learning_rate": 3.82958984375e-05, "loss": 3.261, "step": 7200 }, { "epoch": 0.712890625, "grad_norm": 13.37771224975586, "learning_rate": 3.813313802083334e-05, "loss": 3.3159, "step": 7300 }, { "epoch": 0.72265625, "grad_norm": 13.427688598632812, "learning_rate": 3.797037760416667e-05, "loss": 3.2451, "step": 7400 }, { "epoch": 0.732421875, "grad_norm": 12.314142227172852, "learning_rate": 3.78076171875e-05, "loss": 3.2721, "step": 7500 }, { "epoch": 0.7421875, "grad_norm": 11.418351173400879, "learning_rate": 3.764485677083334e-05, "loss": 3.2943, "step": 7600 }, { "epoch": 0.751953125, "grad_norm": 16.546539306640625, "learning_rate": 3.748209635416667e-05, "loss": 3.2243, "step": 7700 }, { "epoch": 
0.76171875, "grad_norm": 16.984668731689453, "learning_rate": 3.73193359375e-05, "loss": 3.3024, "step": 7800 }, { "epoch": 0.771484375, "grad_norm": 11.702521324157715, "learning_rate": 3.715657552083334e-05, "loss": 3.1967, "step": 7900 }, { "epoch": 0.78125, "grad_norm": 12.537822723388672, "learning_rate": 3.699381510416667e-05, "loss": 3.2535, "step": 8000 }, { "epoch": 0.791015625, "grad_norm": 13.640584945678711, "learning_rate": 3.68310546875e-05, "loss": 3.1677, "step": 8100 }, { "epoch": 0.80078125, "grad_norm": 15.423649787902832, "learning_rate": 3.6668294270833336e-05, "loss": 3.2641, "step": 8200 }, { "epoch": 0.810546875, "grad_norm": 12.065776824951172, "learning_rate": 3.650553385416667e-05, "loss": 3.1169, "step": 8300 }, { "epoch": 0.8203125, "grad_norm": 14.259243965148926, "learning_rate": 3.63427734375e-05, "loss": 3.2357, "step": 8400 }, { "epoch": 0.830078125, "grad_norm": 17.08042335510254, "learning_rate": 3.6180013020833336e-05, "loss": 3.3104, "step": 8500 }, { "epoch": 0.83984375, "grad_norm": 10.867400169372559, "learning_rate": 3.601725260416667e-05, "loss": 3.296, "step": 8600 }, { "epoch": 0.849609375, "grad_norm": 13.106012344360352, "learning_rate": 3.58544921875e-05, "loss": 3.2956, "step": 8700 }, { "epoch": 0.859375, "grad_norm": 15.86103630065918, "learning_rate": 3.5691731770833335e-05, "loss": 3.2569, "step": 8800 }, { "epoch": 0.869140625, "grad_norm": 11.86782169342041, "learning_rate": 3.5528971354166666e-05, "loss": 3.1775, "step": 8900 }, { "epoch": 0.87890625, "grad_norm": 13.925124168395996, "learning_rate": 3.53662109375e-05, "loss": 3.1556, "step": 9000 }, { "epoch": 0.888671875, "grad_norm": 15.298331260681152, "learning_rate": 3.5203450520833335e-05, "loss": 3.2081, "step": 9100 }, { "epoch": 0.8984375, "grad_norm": 12.989383697509766, "learning_rate": 3.5040690104166666e-05, "loss": 3.2758, "step": 9200 }, { "epoch": 0.908203125, "grad_norm": 12.797562599182129, "learning_rate": 3.48779296875e-05, "loss": 3.0895, 
"step": 9300 }, { "epoch": 0.91796875, "grad_norm": 10.773366928100586, "learning_rate": 3.4715169270833335e-05, "loss": 3.1352, "step": 9400 }, { "epoch": 0.927734375, "grad_norm": 13.436513900756836, "learning_rate": 3.4552408854166665e-05, "loss": 3.2934, "step": 9500 }, { "epoch": 0.9375, "grad_norm": 12.20578670501709, "learning_rate": 3.4389648437499996e-05, "loss": 3.22, "step": 9600 }, { "epoch": 0.947265625, "grad_norm": 15.03205680847168, "learning_rate": 3.4226888020833334e-05, "loss": 3.1925, "step": 9700 }, { "epoch": 0.95703125, "grad_norm": 15.373735427856445, "learning_rate": 3.4064127604166665e-05, "loss": 3.1056, "step": 9800 }, { "epoch": 0.966796875, "grad_norm": 13.795890808105469, "learning_rate": 3.39013671875e-05, "loss": 3.1484, "step": 9900 }, { "epoch": 0.9765625, "grad_norm": 15.731973648071289, "learning_rate": 3.3738606770833334e-05, "loss": 3.2102, "step": 10000 }, { "epoch": 0.986328125, "grad_norm": 14.516192436218262, "learning_rate": 3.3575846354166665e-05, "loss": 3.1795, "step": 10100 }, { "epoch": 0.99609375, "grad_norm": 11.511063575744629, "learning_rate": 3.34130859375e-05, "loss": 3.1479, "step": 10200 }, { "epoch": 1.005859375, "grad_norm": 13.947763442993164, "learning_rate": 3.325032552083333e-05, "loss": 3.1697, "step": 10300 }, { "epoch": 1.015625, "grad_norm": 9.85244369506836, "learning_rate": 3.308756510416667e-05, "loss": 3.1739, "step": 10400 }, { "epoch": 1.025390625, "grad_norm": 21.973459243774414, "learning_rate": 3.29248046875e-05, "loss": 2.9587, "step": 10500 }, { "epoch": 1.03515625, "grad_norm": 11.420741081237793, "learning_rate": 3.276204427083334e-05, "loss": 3.1121, "step": 10600 }, { "epoch": 1.044921875, "grad_norm": 10.605298042297363, "learning_rate": 3.259928385416667e-05, "loss": 3.0928, "step": 10700 }, { "epoch": 1.0546875, "grad_norm": 15.158004760742188, "learning_rate": 3.24365234375e-05, "loss": 3.0272, "step": 10800 }, { "epoch": 1.064453125, "grad_norm": 12.386876106262207, 
"learning_rate": 3.227376302083334e-05, "loss": 3.055, "step": 10900 }, { "epoch": 1.07421875, "grad_norm": 14.404585838317871, "learning_rate": 3.211100260416667e-05, "loss": 3.0203, "step": 11000 }, { "epoch": 1.083984375, "grad_norm": 14.187972068786621, "learning_rate": 3.19482421875e-05, "loss": 3.1315, "step": 11100 }, { "epoch": 1.09375, "grad_norm": 11.506830215454102, "learning_rate": 3.178548177083334e-05, "loss": 3.0323, "step": 11200 }, { "epoch": 1.103515625, "grad_norm": 14.504237174987793, "learning_rate": 3.162272135416667e-05, "loss": 2.9987, "step": 11300 }, { "epoch": 1.11328125, "grad_norm": 12.95081615447998, "learning_rate": 3.14599609375e-05, "loss": 2.9773, "step": 11400 }, { "epoch": 1.123046875, "grad_norm": 12.512577056884766, "learning_rate": 3.129720052083334e-05, "loss": 3.1454, "step": 11500 }, { "epoch": 1.1328125, "grad_norm": 11.951395988464355, "learning_rate": 3.113444010416667e-05, "loss": 2.9314, "step": 11600 }, { "epoch": 1.142578125, "grad_norm": 16.119001388549805, "learning_rate": 3.09716796875e-05, "loss": 2.9137, "step": 11700 }, { "epoch": 1.15234375, "grad_norm": 10.542379379272461, "learning_rate": 3.080891927083334e-05, "loss": 3.1176, "step": 11800 }, { "epoch": 1.162109375, "grad_norm": 14.544652938842773, "learning_rate": 3.064615885416667e-05, "loss": 2.9937, "step": 11900 }, { "epoch": 1.171875, "grad_norm": 11.01784896850586, "learning_rate": 3.04833984375e-05, "loss": 3.0301, "step": 12000 }, { "epoch": 1.181640625, "grad_norm": 12.359283447265625, "learning_rate": 3.0320638020833337e-05, "loss": 2.9277, "step": 12100 }, { "epoch": 1.19140625, "grad_norm": 18.171897888183594, "learning_rate": 3.015787760416667e-05, "loss": 3.0673, "step": 12200 }, { "epoch": 1.201171875, "grad_norm": 12.454042434692383, "learning_rate": 2.99951171875e-05, "loss": 2.9398, "step": 12300 }, { "epoch": 1.2109375, "grad_norm": 12.611340522766113, "learning_rate": 2.9832356770833337e-05, "loss": 2.926, "step": 12400 }, { "epoch": 
1.220703125, "grad_norm": 12.207980155944824, "learning_rate": 2.9669596354166668e-05, "loss": 2.9713, "step": 12500 }, { "epoch": 1.23046875, "grad_norm": 11.924321174621582, "learning_rate": 2.95068359375e-05, "loss": 3.0734, "step": 12600 }, { "epoch": 1.240234375, "grad_norm": 10.981707572937012, "learning_rate": 2.9344075520833337e-05, "loss": 2.983, "step": 12700 }, { "epoch": 1.25, "grad_norm": 9.699291229248047, "learning_rate": 2.9181315104166667e-05, "loss": 3.0235, "step": 12800 }, { "epoch": 1.259765625, "grad_norm": 11.410511016845703, "learning_rate": 2.90185546875e-05, "loss": 3.081, "step": 12900 }, { "epoch": 1.26953125, "grad_norm": 20.204944610595703, "learning_rate": 2.8855794270833336e-05, "loss": 2.958, "step": 13000 }, { "epoch": 1.279296875, "grad_norm": 9.388766288757324, "learning_rate": 2.8693033854166667e-05, "loss": 3.0007, "step": 13100 }, { "epoch": 1.2890625, "grad_norm": 11.05562973022461, "learning_rate": 2.8530273437499998e-05, "loss": 3.0186, "step": 13200 }, { "epoch": 1.298828125, "grad_norm": 12.052275657653809, "learning_rate": 2.8367513020833336e-05, "loss": 2.9891, "step": 13300 }, { "epoch": 1.30859375, "grad_norm": 17.93643569946289, "learning_rate": 2.8204752604166667e-05, "loss": 2.9863, "step": 13400 }, { "epoch": 1.318359375, "grad_norm": 16.745187759399414, "learning_rate": 2.8041992187499998e-05, "loss": 2.963, "step": 13500 }, { "epoch": 1.328125, "grad_norm": 11.541468620300293, "learning_rate": 2.7879231770833335e-05, "loss": 3.0955, "step": 13600 }, { "epoch": 1.337890625, "grad_norm": 12.31360912322998, "learning_rate": 2.7716471354166666e-05, "loss": 3.0445, "step": 13700 }, { "epoch": 1.34765625, "grad_norm": 12.349579811096191, "learning_rate": 2.75537109375e-05, "loss": 2.9165, "step": 13800 }, { "epoch": 1.357421875, "grad_norm": 11.030657768249512, "learning_rate": 2.7390950520833335e-05, "loss": 2.9315, "step": 13900 }, { "epoch": 1.3671875, "grad_norm": 14.333861351013184, "learning_rate": 
2.722819010416667e-05, "loss": 3.0059, "step": 14000 }, { "epoch": 1.376953125, "grad_norm": 15.17336368560791, "learning_rate": 2.70654296875e-05, "loss": 3.0364, "step": 14100 }, { "epoch": 1.38671875, "grad_norm": 14.133316040039062, "learning_rate": 2.6902669270833338e-05, "loss": 3.0579, "step": 14200 }, { "epoch": 1.396484375, "grad_norm": 11.148407936096191, "learning_rate": 2.673990885416667e-05, "loss": 2.975, "step": 14300 }, { "epoch": 1.40625, "grad_norm": 14.075923919677734, "learning_rate": 2.65771484375e-05, "loss": 3.0044, "step": 14400 }, { "epoch": 1.416015625, "grad_norm": 14.998821258544922, "learning_rate": 2.6414388020833337e-05, "loss": 2.9768, "step": 14500 }, { "epoch": 1.42578125, "grad_norm": 12.305022239685059, "learning_rate": 2.6251627604166668e-05, "loss": 2.9711, "step": 14600 }, { "epoch": 1.435546875, "grad_norm": 16.09569549560547, "learning_rate": 2.609049479166667e-05, "loss": 2.8913, "step": 14700 }, { "epoch": 1.4453125, "grad_norm": 10.328545570373535, "learning_rate": 2.5927734375e-05, "loss": 2.9569, "step": 14800 }, { "epoch": 1.455078125, "grad_norm": 10.365655899047852, "learning_rate": 2.576497395833333e-05, "loss": 2.9558, "step": 14900 }, { "epoch": 1.46484375, "grad_norm": 11.824304580688477, "learning_rate": 2.560221354166667e-05, "loss": 2.906, "step": 15000 }, { "epoch": 1.474609375, "grad_norm": 13.518112182617188, "learning_rate": 2.5439453125e-05, "loss": 2.9135, "step": 15100 }, { "epoch": 1.484375, "grad_norm": 13.241069793701172, "learning_rate": 2.527669270833333e-05, "loss": 2.8683, "step": 15200 }, { "epoch": 1.494140625, "grad_norm": 10.397425651550293, "learning_rate": 2.5115559895833336e-05, "loss": 2.9151, "step": 15300 }, { "epoch": 1.50390625, "grad_norm": 9.971900939941406, "learning_rate": 2.495279947916667e-05, "loss": 2.8387, "step": 15400 }, { "epoch": 1.513671875, "grad_norm": 12.148921966552734, "learning_rate": 2.47900390625e-05, "loss": 3.0045, "step": 15500 }, { "epoch": 1.5234375, 
"grad_norm": 12.122273445129395, "learning_rate": 2.4627278645833336e-05, "loss": 2.8201, "step": 15600 }, { "epoch": 1.533203125, "grad_norm": 10.754613876342773, "learning_rate": 2.446451822916667e-05, "loss": 3.0412, "step": 15700 }, { "epoch": 1.54296875, "grad_norm": 12.5385103225708, "learning_rate": 2.43017578125e-05, "loss": 2.9698, "step": 15800 }, { "epoch": 1.552734375, "grad_norm": 12.530341148376465, "learning_rate": 2.4138997395833335e-05, "loss": 2.9844, "step": 15900 }, { "epoch": 1.5625, "grad_norm": 12.995160102844238, "learning_rate": 2.397623697916667e-05, "loss": 2.9505, "step": 16000 }, { "epoch": 1.572265625, "grad_norm": 11.06143856048584, "learning_rate": 2.38134765625e-05, "loss": 2.9299, "step": 16100 }, { "epoch": 1.58203125, "grad_norm": 11.600089073181152, "learning_rate": 2.3650716145833335e-05, "loss": 2.9184, "step": 16200 }, { "epoch": 1.591796875, "grad_norm": 9.294841766357422, "learning_rate": 2.348795572916667e-05, "loss": 3.0312, "step": 16300 }, { "epoch": 1.6015625, "grad_norm": 13.555610656738281, "learning_rate": 2.33251953125e-05, "loss": 2.899, "step": 16400 }, { "epoch": 1.611328125, "grad_norm": 11.077611923217773, "learning_rate": 2.3162434895833334e-05, "loss": 2.9032, "step": 16500 }, { "epoch": 1.62109375, "grad_norm": 12.092378616333008, "learning_rate": 2.299967447916667e-05, "loss": 3.0475, "step": 16600 }, { "epoch": 1.630859375, "grad_norm": 18.95319175720215, "learning_rate": 2.28369140625e-05, "loss": 2.8485, "step": 16700 }, { "epoch": 1.640625, "grad_norm": 20.765520095825195, "learning_rate": 2.2674153645833334e-05, "loss": 3.0211, "step": 16800 }, { "epoch": 1.650390625, "grad_norm": 19.739500045776367, "learning_rate": 2.2511393229166668e-05, "loss": 3.0398, "step": 16900 }, { "epoch": 1.66015625, "grad_norm": 12.64313793182373, "learning_rate": 2.23486328125e-05, "loss": 2.8028, "step": 17000 }, { "epoch": 1.669921875, "grad_norm": 19.063640594482422, "learning_rate": 2.2185872395833333e-05, "loss": 
2.896, "step": 17100 }, { "epoch": 1.6796875, "grad_norm": 23.34886932373047, "learning_rate": 2.2023111979166668e-05, "loss": 2.9162, "step": 17200 }, { "epoch": 1.689453125, "grad_norm": 11.44904899597168, "learning_rate": 2.1860351562500002e-05, "loss": 2.9423, "step": 17300 }, { "epoch": 1.69921875, "grad_norm": 16.1793155670166, "learning_rate": 2.1697591145833336e-05, "loss": 2.917, "step": 17400 }, { "epoch": 1.708984375, "grad_norm": 9.111202239990234, "learning_rate": 2.1534830729166667e-05, "loss": 2.9147, "step": 17500 }, { "epoch": 1.71875, "grad_norm": 12.297972679138184, "learning_rate": 2.13720703125e-05, "loss": 2.8541, "step": 17600 }, { "epoch": 1.728515625, "grad_norm": 14.833362579345703, "learning_rate": 2.1209309895833336e-05, "loss": 2.9548, "step": 17700 }, { "epoch": 1.73828125, "grad_norm": 11.36043643951416, "learning_rate": 2.1048177083333334e-05, "loss": 2.8732, "step": 17800 }, { "epoch": 1.748046875, "grad_norm": 12.69233512878418, "learning_rate": 2.088541666666667e-05, "loss": 2.9447, "step": 17900 }, { "epoch": 1.7578125, "grad_norm": 13.44200611114502, "learning_rate": 2.072265625e-05, "loss": 2.8824, "step": 18000 }, { "epoch": 1.767578125, "grad_norm": 10.753628730773926, "learning_rate": 2.05615234375e-05, "loss": 2.8455, "step": 18100 }, { "epoch": 1.77734375, "grad_norm": 13.760424613952637, "learning_rate": 2.0398763020833335e-05, "loss": 2.9853, "step": 18200 }, { "epoch": 1.787109375, "grad_norm": 16.471168518066406, "learning_rate": 2.023600260416667e-05, "loss": 2.8303, "step": 18300 }, { "epoch": 1.796875, "grad_norm": 13.708788871765137, "learning_rate": 2.0073242187500004e-05, "loss": 2.9086, "step": 18400 }, { "epoch": 1.806640625, "grad_norm": 13.522102355957031, "learning_rate": 1.9912109375000002e-05, "loss": 2.9567, "step": 18500 }, { "epoch": 1.81640625, "grad_norm": 14.42663288116455, "learning_rate": 1.9749348958333333e-05, "loss": 2.8666, "step": 18600 }, { "epoch": 1.826171875, "grad_norm": 
10.03260326385498, "learning_rate": 1.9586588541666667e-05, "loss": 2.9193, "step": 18700 }, { "epoch": 1.8359375, "grad_norm": 10.757763862609863, "learning_rate": 1.9423828125e-05, "loss": 3.0071, "step": 18800 }, { "epoch": 1.845703125, "grad_norm": 12.414703369140625, "learning_rate": 1.9261067708333332e-05, "loss": 2.9446, "step": 18900 }, { "epoch": 1.85546875, "grad_norm": 12.182251930236816, "learning_rate": 1.9098307291666667e-05, "loss": 2.9584, "step": 19000 }, { "epoch": 1.865234375, "grad_norm": 13.588275909423828, "learning_rate": 1.8935546875e-05, "loss": 2.8303, "step": 19100 }, { "epoch": 1.875, "grad_norm": 11.538961410522461, "learning_rate": 1.8772786458333332e-05, "loss": 2.9129, "step": 19200 }, { "epoch": 1.884765625, "grad_norm": 14.856468200683594, "learning_rate": 1.8610026041666666e-05, "loss": 2.8229, "step": 19300 }, { "epoch": 1.89453125, "grad_norm": 12.910444259643555, "learning_rate": 1.8447265625e-05, "loss": 2.9851, "step": 19400 }, { "epoch": 1.904296875, "grad_norm": 12.187970161437988, "learning_rate": 1.828450520833333e-05, "loss": 2.863, "step": 19500 }, { "epoch": 1.9140625, "grad_norm": 16.951251983642578, "learning_rate": 1.8121744791666666e-05, "loss": 2.8726, "step": 19600 }, { "epoch": 1.923828125, "grad_norm": 10.626237869262695, "learning_rate": 1.7958984375e-05, "loss": 2.79, "step": 19700 }, { "epoch": 1.93359375, "grad_norm": 13.63193416595459, "learning_rate": 1.7796223958333334e-05, "loss": 2.8807, "step": 19800 }, { "epoch": 1.943359375, "grad_norm": 12.877738952636719, "learning_rate": 1.763346354166667e-05, "loss": 2.7945, "step": 19900 }, { "epoch": 1.953125, "grad_norm": 12.08793830871582, "learning_rate": 1.7470703125000003e-05, "loss": 2.8932, "step": 20000 }, { "epoch": 1.962890625, "grad_norm": 12.529877662658691, "learning_rate": 1.7307942708333334e-05, "loss": 2.9178, "step": 20100 }, { "epoch": 1.97265625, "grad_norm": 12.949920654296875, "learning_rate": 1.7145182291666668e-05, "loss": 2.8207, 
"step": 20200 }, { "epoch": 1.982421875, "grad_norm": 16.327682495117188, "learning_rate": 1.6982421875000003e-05, "loss": 2.9098, "step": 20300 }, { "epoch": 1.9921875, "grad_norm": 13.496146202087402, "learning_rate": 1.6819661458333334e-05, "loss": 2.7364, "step": 20400 }, { "epoch": 2.001953125, "grad_norm": 18.41584014892578, "learning_rate": 1.6656901041666668e-05, "loss": 2.8233, "step": 20500 }, { "epoch": 2.01171875, "grad_norm": 14.442363739013672, "learning_rate": 1.6494140625000002e-05, "loss": 2.7373, "step": 20600 }, { "epoch": 2.021484375, "grad_norm": 13.7149019241333, "learning_rate": 1.6331380208333333e-05, "loss": 2.7693, "step": 20700 }, { "epoch": 2.03125, "grad_norm": 17.81890296936035, "learning_rate": 1.6168619791666667e-05, "loss": 2.7528, "step": 20800 }, { "epoch": 2.041015625, "grad_norm": 11.57737922668457, "learning_rate": 1.6007486979166666e-05, "loss": 2.7167, "step": 20900 }, { "epoch": 2.05078125, "grad_norm": 12.967355728149414, "learning_rate": 1.58447265625e-05, "loss": 2.7034, "step": 21000 }, { "epoch": 2.060546875, "grad_norm": 10.983343124389648, "learning_rate": 1.568359375e-05, "loss": 2.7766, "step": 21100 }, { "epoch": 2.0703125, "grad_norm": 16.468509674072266, "learning_rate": 1.5520833333333336e-05, "loss": 2.7992, "step": 21200 }, { "epoch": 2.080078125, "grad_norm": 16.79848861694336, "learning_rate": 1.535807291666667e-05, "loss": 2.7826, "step": 21300 }, { "epoch": 2.08984375, "grad_norm": 16.537952423095703, "learning_rate": 1.5195312500000001e-05, "loss": 2.7119, "step": 21400 }, { "epoch": 2.099609375, "grad_norm": 11.28368854522705, "learning_rate": 1.5032552083333335e-05, "loss": 2.7093, "step": 21500 }, { "epoch": 2.109375, "grad_norm": 58.54060745239258, "learning_rate": 1.4869791666666668e-05, "loss": 2.7219, "step": 21600 }, { "epoch": 2.119140625, "grad_norm": 12.917949676513672, "learning_rate": 1.470703125e-05, "loss": 2.8239, "step": 21700 }, { "epoch": 2.12890625, "grad_norm": 18.68291664123535, 
"learning_rate": 1.4544270833333335e-05, "loss": 2.7574, "step": 21800 }, { "epoch": 2.138671875, "grad_norm": 11.076837539672852, "learning_rate": 1.4381510416666669e-05, "loss": 2.8418, "step": 21900 }, { "epoch": 2.1484375, "grad_norm": 16.057594299316406, "learning_rate": 1.421875e-05, "loss": 2.822, "step": 22000 }, { "epoch": 2.158203125, "grad_norm": 17.087045669555664, "learning_rate": 1.4055989583333334e-05, "loss": 2.7588, "step": 22100 }, { "epoch": 2.16796875, "grad_norm": 11.648773193359375, "learning_rate": 1.3893229166666669e-05, "loss": 2.717, "step": 22200 }, { "epoch": 2.177734375, "grad_norm": 12.739274024963379, "learning_rate": 1.373046875e-05, "loss": 2.6956, "step": 22300 }, { "epoch": 2.1875, "grad_norm": 10.17639446258545, "learning_rate": 1.3567708333333334e-05, "loss": 2.8241, "step": 22400 }, { "epoch": 2.197265625, "grad_norm": 13.572341918945312, "learning_rate": 1.3404947916666668e-05, "loss": 2.7858, "step": 22500 }, { "epoch": 2.20703125, "grad_norm": 14.310699462890625, "learning_rate": 1.3242187500000001e-05, "loss": 2.6909, "step": 22600 }, { "epoch": 2.216796875, "grad_norm": 11.991633415222168, "learning_rate": 1.3079427083333335e-05, "loss": 2.7708, "step": 22700 }, { "epoch": 2.2265625, "grad_norm": 14.214717864990234, "learning_rate": 1.2916666666666668e-05, "loss": 2.7603, "step": 22800 }, { "epoch": 2.236328125, "grad_norm": 16.019987106323242, "learning_rate": 1.275390625e-05, "loss": 2.7493, "step": 22900 }, { "epoch": 2.24609375, "grad_norm": 11.817761421203613, "learning_rate": 1.2591145833333335e-05, "loss": 2.7647, "step": 23000 }, { "epoch": 2.255859375, "grad_norm": 16.247276306152344, "learning_rate": 1.2430013020833335e-05, "loss": 2.7605, "step": 23100 }, { "epoch": 2.265625, "grad_norm": 11.79404067993164, "learning_rate": 1.2267252604166667e-05, "loss": 2.7058, "step": 23200 }, { "epoch": 2.275390625, "grad_norm": 12.77724838256836, "learning_rate": 1.2104492187500001e-05, "loss": 2.7446, "step": 23300 }, { 
"epoch": 2.28515625, "grad_norm": 11.609589576721191, "learning_rate": 1.1941731770833334e-05, "loss": 2.7794, "step": 23400 }, { "epoch": 2.294921875, "grad_norm": 13.240425109863281, "learning_rate": 1.1780598958333334e-05, "loss": 2.8056, "step": 23500 }, { "epoch": 2.3046875, "grad_norm": 15.682677268981934, "learning_rate": 1.1617838541666668e-05, "loss": 2.7102, "step": 23600 }, { "epoch": 2.314453125, "grad_norm": 15.334482192993164, "learning_rate": 1.1455078125e-05, "loss": 2.8026, "step": 23700 }, { "epoch": 2.32421875, "grad_norm": 11.944014549255371, "learning_rate": 1.1292317708333335e-05, "loss": 2.7106, "step": 23800 }, { "epoch": 2.333984375, "grad_norm": 13.437361717224121, "learning_rate": 1.1129557291666668e-05, "loss": 2.6837, "step": 23900 }, { "epoch": 2.34375, "grad_norm": 15.150136947631836, "learning_rate": 1.0966796875e-05, "loss": 2.7095, "step": 24000 }, { "epoch": 2.353515625, "grad_norm": 13.133088111877441, "learning_rate": 1.0804036458333335e-05, "loss": 2.7609, "step": 24100 }, { "epoch": 2.36328125, "grad_norm": 12.005653381347656, "learning_rate": 1.0641276041666667e-05, "loss": 2.7086, "step": 24200 }, { "epoch": 2.373046875, "grad_norm": 20.258712768554688, "learning_rate": 1.0478515625e-05, "loss": 2.8082, "step": 24300 }, { "epoch": 2.3828125, "grad_norm": 14.602194786071777, "learning_rate": 1.0315755208333334e-05, "loss": 2.6246, "step": 24400 }, { "epoch": 2.392578125, "grad_norm": 13.00714111328125, "learning_rate": 1.0152994791666667e-05, "loss": 2.8249, "step": 24500 }, { "epoch": 2.40234375, "grad_norm": 11.645508766174316, "learning_rate": 9.990234375e-06, "loss": 2.7555, "step": 24600 }, { "epoch": 2.412109375, "grad_norm": 17.61017417907715, "learning_rate": 9.827473958333334e-06, "loss": 2.6942, "step": 24700 }, { "epoch": 2.421875, "grad_norm": 12.34157943725586, "learning_rate": 9.664713541666668e-06, "loss": 2.7215, "step": 24800 }, { "epoch": 2.431640625, "grad_norm": 12.765501976013184, "learning_rate": 
9.501953125e-06, "loss": 2.5747, "step": 24900 }, { "epoch": 2.44140625, "grad_norm": 12.85317611694336, "learning_rate": 9.339192708333335e-06, "loss": 2.7917, "step": 25000 }, { "epoch": 2.451171875, "grad_norm": 12.610406875610352, "learning_rate": 9.176432291666668e-06, "loss": 2.8635, "step": 25100 }, { "epoch": 2.4609375, "grad_norm": 12.031999588012695, "learning_rate": 9.013671875e-06, "loss": 2.6288, "step": 25200 }, { "epoch": 2.470703125, "grad_norm": 10.63759994506836, "learning_rate": 8.850911458333335e-06, "loss": 2.7396, "step": 25300 }, { "epoch": 2.48046875, "grad_norm": 18.768993377685547, "learning_rate": 8.688151041666667e-06, "loss": 2.6733, "step": 25400 }, { "epoch": 2.490234375, "grad_norm": 9.411011695861816, "learning_rate": 8.525390625e-06, "loss": 2.7355, "step": 25500 }, { "epoch": 2.5, "grad_norm": 11.31039810180664, "learning_rate": 8.362630208333334e-06, "loss": 2.6648, "step": 25600 }, { "epoch": 2.509765625, "grad_norm": 11.34940242767334, "learning_rate": 8.199869791666667e-06, "loss": 2.6185, "step": 25700 }, { "epoch": 2.51953125, "grad_norm": 13.539913177490234, "learning_rate": 8.037109375e-06, "loss": 2.6344, "step": 25800 }, { "epoch": 2.529296875, "grad_norm": 12.985583305358887, "learning_rate": 7.874348958333334e-06, "loss": 2.7028, "step": 25900 }, { "epoch": 2.5390625, "grad_norm": 23.021692276000977, "learning_rate": 7.711588541666666e-06, "loss": 2.6719, "step": 26000 }, { "epoch": 2.548828125, "grad_norm": 14.796003341674805, "learning_rate": 7.548828125e-06, "loss": 2.669, "step": 26100 }, { "epoch": 2.55859375, "grad_norm": 16.303749084472656, "learning_rate": 7.386067708333334e-06, "loss": 2.5954, "step": 26200 }, { "epoch": 2.568359375, "grad_norm": 10.631623268127441, "learning_rate": 7.223307291666667e-06, "loss": 2.669, "step": 26300 }, { "epoch": 2.578125, "grad_norm": 13.405618667602539, "learning_rate": 7.060546875e-06, "loss": 2.7102, "step": 26400 }, { "epoch": 2.587890625, "grad_norm": 16.89972496032715, 
"learning_rate": 6.897786458333335e-06, "loss": 2.7198, "step": 26500 }, { "epoch": 2.59765625, "grad_norm": 14.60909652709961, "learning_rate": 6.735026041666667e-06, "loss": 2.686, "step": 26600 }, { "epoch": 2.607421875, "grad_norm": 16.859573364257812, "learning_rate": 6.572265625e-06, "loss": 2.7073, "step": 26700 }, { "epoch": 2.6171875, "grad_norm": 12.876221656799316, "learning_rate": 6.409505208333334e-06, "loss": 2.66, "step": 26800 }, { "epoch": 2.626953125, "grad_norm": 12.116389274597168, "learning_rate": 6.246744791666667e-06, "loss": 2.6773, "step": 26900 }, { "epoch": 2.63671875, "grad_norm": 13.397444725036621, "learning_rate": 6.083984375e-06, "loss": 2.5832, "step": 27000 }, { "epoch": 2.646484375, "grad_norm": 14.27937126159668, "learning_rate": 5.921223958333334e-06, "loss": 2.7137, "step": 27100 }, { "epoch": 2.65625, "grad_norm": 12.069489479064941, "learning_rate": 5.758463541666667e-06, "loss": 2.6435, "step": 27200 }, { "epoch": 2.666015625, "grad_norm": 11.179854393005371, "learning_rate": 5.595703125e-06, "loss": 2.707, "step": 27300 }, { "epoch": 2.67578125, "grad_norm": 11.071802139282227, "learning_rate": 5.432942708333333e-06, "loss": 2.6166, "step": 27400 }, { "epoch": 2.685546875, "grad_norm": 14.306278228759766, "learning_rate": 5.270182291666667e-06, "loss": 2.6747, "step": 27500 }, { "epoch": 2.6953125, "grad_norm": 14.3062744140625, "learning_rate": 5.107421875e-06, "loss": 2.5382, "step": 27600 }, { "epoch": 2.705078125, "grad_norm": 14.975716590881348, "learning_rate": 4.944661458333334e-06, "loss": 2.7215, "step": 27700 }, { "epoch": 2.71484375, "grad_norm": 14.584077835083008, "learning_rate": 4.781901041666667e-06, "loss": 2.7351, "step": 27800 }, { "epoch": 2.724609375, "grad_norm": 11.181657791137695, "learning_rate": 4.619140625e-06, "loss": 2.576, "step": 27900 }, { "epoch": 2.734375, "grad_norm": 10.974489212036133, "learning_rate": 4.456380208333333e-06, "loss": 2.7631, "step": 28000 }, { "epoch": 2.744140625, 
"grad_norm": 15.731523513793945, "learning_rate": 4.295247395833334e-06, "loss": 2.5512, "step": 28100 }, { "epoch": 2.75390625, "grad_norm": 12.23338794708252, "learning_rate": 4.132486979166667e-06, "loss": 2.6233, "step": 28200 }, { "epoch": 2.763671875, "grad_norm": 9.420981407165527, "learning_rate": 3.9697265625e-06, "loss": 2.6464, "step": 28300 }, { "epoch": 2.7734375, "grad_norm": 10.947881698608398, "learning_rate": 3.8069661458333335e-06, "loss": 2.6197, "step": 28400 }, { "epoch": 2.783203125, "grad_norm": 17.761030197143555, "learning_rate": 3.644205729166667e-06, "loss": 2.6341, "step": 28500 }, { "epoch": 2.79296875, "grad_norm": 15.768482208251953, "learning_rate": 3.4814453125e-06, "loss": 2.653, "step": 28600 }, { "epoch": 2.802734375, "grad_norm": 13.388958930969238, "learning_rate": 3.3186848958333335e-06, "loss": 2.5511, "step": 28700 }, { "epoch": 2.8125, "grad_norm": 11.864120483398438, "learning_rate": 3.155924479166667e-06, "loss": 2.7781, "step": 28800 }, { "epoch": 2.822265625, "grad_norm": 11.44416618347168, "learning_rate": 2.9931640625e-06, "loss": 2.6667, "step": 28900 }, { "epoch": 2.83203125, "grad_norm": 13.012479782104492, "learning_rate": 2.8304036458333335e-06, "loss": 2.6217, "step": 29000 }, { "epoch": 2.841796875, "grad_norm": 14.73644733428955, "learning_rate": 2.667643229166667e-06, "loss": 2.606, "step": 29100 }, { "epoch": 2.8515625, "grad_norm": 12.075024604797363, "learning_rate": 2.5048828125e-06, "loss": 2.5863, "step": 29200 }, { "epoch": 2.861328125, "grad_norm": 14.664973258972168, "learning_rate": 2.3421223958333335e-06, "loss": 2.7423, "step": 29300 }, { "epoch": 2.87109375, "grad_norm": 14.096378326416016, "learning_rate": 2.179361979166667e-06, "loss": 2.7122, "step": 29400 }, { "epoch": 2.880859375, "grad_norm": 12.812738418579102, "learning_rate": 2.0166015625e-06, "loss": 2.7648, "step": 29500 }, { "epoch": 2.890625, "grad_norm": 16.189590454101562, "learning_rate": 1.8538411458333335e-06, "loss": 2.669, 
"step": 29600 }, { "epoch": 2.900390625, "grad_norm": 14.232686996459961, "learning_rate": 1.6910807291666667e-06, "loss": 2.6242, "step": 29700 }, { "epoch": 2.91015625, "grad_norm": 16.499113082885742, "learning_rate": 1.5283203125000002e-06, "loss": 2.7261, "step": 29800 }, { "epoch": 2.919921875, "grad_norm": 10.421613693237305, "learning_rate": 1.3655598958333332e-06, "loss": 2.5805, "step": 29900 }, { "epoch": 2.9296875, "grad_norm": 11.867895126342773, "learning_rate": 1.2027994791666667e-06, "loss": 2.6225, "step": 30000 }, { "epoch": 2.939453125, "grad_norm": 14.396392822265625, "learning_rate": 1.0400390625000002e-06, "loss": 2.591, "step": 30100 }, { "epoch": 2.94921875, "grad_norm": 26.637563705444336, "learning_rate": 8.772786458333333e-07, "loss": 2.6324, "step": 30200 }, { "epoch": 2.958984375, "grad_norm": 13.43743896484375, "learning_rate": 7.145182291666667e-07, "loss": 2.7171, "step": 30300 }, { "epoch": 2.96875, "grad_norm": 13.423970222473145, "learning_rate": 5.517578125000001e-07, "loss": 2.5509, "step": 30400 }, { "epoch": 2.978515625, "grad_norm": 12.26883316040039, "learning_rate": 3.889973958333334e-07, "loss": 2.6303, "step": 30500 }, { "epoch": 2.98828125, "grad_norm": 13.305875778198242, "learning_rate": 2.262369791666667e-07, "loss": 2.6542, "step": 30600 }, { "epoch": 2.998046875, "grad_norm": 18.230594635009766, "learning_rate": 6.347656250000001e-08, "loss": 2.6088, "step": 30700 }, { "epoch": 3.0, "step": 30720, "total_flos": 4.206282006594048e+16, "train_loss": 3.0830205624302227, "train_runtime": 6816.3036, "train_samples_per_second": 72.109, "train_steps_per_second": 4.507 } ], "logging_steps": 100, "max_steps": 30720, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.206282006594048e+16, 
"train_batch_size": 16, "trial_name": null, "trial_params": null }