EntropyLong_128K / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 370.1074523925781,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.3662,
"num_input_tokens_seen": 8388608,
"step": 1
},
{
"epoch": 0.001,
"grad_norm": 380.1726379394531,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.3291,
"num_input_tokens_seen": 16777216,
"step": 2
},
{
"epoch": 0.0015,
"grad_norm": 414.3145446777344,
"learning_rate": 6.000000000000001e-07,
"loss": 3.2201,
"num_input_tokens_seen": 25165824,
"step": 3
},
{
"epoch": 0.002,
"grad_norm": 389.34332275390625,
"learning_rate": 8.000000000000001e-07,
"loss": 3.2735,
"num_input_tokens_seen": 33554432,
"step": 4
},
{
"epoch": 0.0025,
"grad_norm": 385.0985412597656,
"learning_rate": 1.0000000000000002e-06,
"loss": 3.2293,
"num_input_tokens_seen": 41943040,
"step": 5
},
{
"epoch": 0.003,
"grad_norm": 267.23883056640625,
"learning_rate": 1.2000000000000002e-06,
"loss": 3.1203,
"num_input_tokens_seen": 50331648,
"step": 6
},
{
"epoch": 0.0035,
"grad_norm": 249.7722930908203,
"learning_rate": 1.4000000000000001e-06,
"loss": 3.1226,
"num_input_tokens_seen": 58720256,
"step": 7
},
{
"epoch": 0.004,
"grad_norm": 119.61425018310547,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.6311,
"num_input_tokens_seen": 67108864,
"step": 8
},
{
"epoch": 0.0045,
"grad_norm": 108.58890533447266,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.6811,
"num_input_tokens_seen": 75497472,
"step": 9
},
{
"epoch": 0.005,
"grad_norm": 98.306884765625,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.6586,
"num_input_tokens_seen": 83886080,
"step": 10
},
{
"epoch": 0.0055,
"grad_norm": 47.70505142211914,
"learning_rate": 2.2e-06,
"loss": 2.0387,
"num_input_tokens_seen": 92274688,
"step": 11
},
{
"epoch": 0.006,
"grad_norm": 44.65153121948242,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.1509,
"num_input_tokens_seen": 100663296,
"step": 12
},
{
"epoch": 0.0065,
"grad_norm": 36.97843551635742,
"learning_rate": 2.6e-06,
"loss": 2.0554,
"num_input_tokens_seen": 109051904,
"step": 13
},
{
"epoch": 0.007,
"grad_norm": 27.567747116088867,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.9823,
"num_input_tokens_seen": 117440512,
"step": 14
},
{
"epoch": 0.0075,
"grad_norm": 24.83814239501953,
"learning_rate": 3e-06,
"loss": 1.9501,
"num_input_tokens_seen": 125829120,
"step": 15
},
{
"epoch": 0.008,
"grad_norm": 19.835155487060547,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.8774,
"num_input_tokens_seen": 134217728,
"step": 16
},
{
"epoch": 0.0085,
"grad_norm": 14.070314407348633,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.8801,
"num_input_tokens_seen": 142606336,
"step": 17
},
{
"epoch": 0.009,
"grad_norm": 14.519227981567383,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.9272,
"num_input_tokens_seen": 150994944,
"step": 18
},
{
"epoch": 0.0095,
"grad_norm": 14.988875389099121,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.9902,
"num_input_tokens_seen": 159383552,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 9.906997680664062,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9851,
"num_input_tokens_seen": 167772160,
"step": 20
},
{
"epoch": 0.0105,
"grad_norm": 7.837595462799072,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.9127,
"num_input_tokens_seen": 176160768,
"step": 21
},
{
"epoch": 0.011,
"grad_norm": 4.843110084533691,
"learning_rate": 4.4e-06,
"loss": 1.8756,
"num_input_tokens_seen": 184549376,
"step": 22
},
{
"epoch": 0.0115,
"grad_norm": 4.830221176147461,
"learning_rate": 4.600000000000001e-06,
"loss": 1.691,
"num_input_tokens_seen": 192937984,
"step": 23
},
{
"epoch": 0.012,
"grad_norm": 2.5043883323669434,
"learning_rate": 4.800000000000001e-06,
"loss": 1.8997,
"num_input_tokens_seen": 201326592,
"step": 24
},
{
"epoch": 0.0125,
"grad_norm": 2.168509006500244,
"learning_rate": 5e-06,
"loss": 1.8075,
"num_input_tokens_seen": 209715200,
"step": 25
},
{
"epoch": 0.013,
"grad_norm": 2.0097780227661133,
"learning_rate": 5.2e-06,
"loss": 1.8527,
"num_input_tokens_seen": 218103808,
"step": 26
},
{
"epoch": 0.0135,
"grad_norm": 3.2067172527313232,
"learning_rate": 5.400000000000001e-06,
"loss": 1.6836,
"num_input_tokens_seen": 226492416,
"step": 27
},
{
"epoch": 0.014,
"grad_norm": 2.623101234436035,
"learning_rate": 5.600000000000001e-06,
"loss": 1.6736,
"num_input_tokens_seen": 234881024,
"step": 28
},
{
"epoch": 0.0145,
"grad_norm": 3.066000461578369,
"learning_rate": 5.8e-06,
"loss": 1.6637,
"num_input_tokens_seen": 243269632,
"step": 29
},
{
"epoch": 0.015,
"grad_norm": 2.0795412063598633,
"learning_rate": 6e-06,
"loss": 1.6973,
"num_input_tokens_seen": 251658240,
"step": 30
},
{
"epoch": 0.0155,
"grad_norm": 2.8886075019836426,
"learning_rate": 6.200000000000001e-06,
"loss": 1.9608,
"num_input_tokens_seen": 260046848,
"step": 31
},
{
"epoch": 0.016,
"grad_norm": 2.7311999797821045,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.8288,
"num_input_tokens_seen": 268435456,
"step": 32
},
{
"epoch": 0.0165,
"grad_norm": 1.648906946182251,
"learning_rate": 6.600000000000001e-06,
"loss": 1.797,
"num_input_tokens_seen": 276824064,
"step": 33
},
{
"epoch": 0.017,
"grad_norm": 1.3969415426254272,
"learning_rate": 6.800000000000001e-06,
"loss": 1.7316,
"num_input_tokens_seen": 285212672,
"step": 34
},
{
"epoch": 0.0175,
"grad_norm": 1.9224807024002075,
"learning_rate": 7e-06,
"loss": 1.8746,
"num_input_tokens_seen": 293601280,
"step": 35
},
{
"epoch": 0.018,
"grad_norm": 1.349591851234436,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.8283,
"num_input_tokens_seen": 301989888,
"step": 36
},
{
"epoch": 0.0185,
"grad_norm": 1.09238600730896,
"learning_rate": 7.4e-06,
"loss": 2.0065,
"num_input_tokens_seen": 310378496,
"step": 37
},
{
"epoch": 0.019,
"grad_norm": 2.277617931365967,
"learning_rate": 7.600000000000001e-06,
"loss": 1.7303,
"num_input_tokens_seen": 318767104,
"step": 38
},
{
"epoch": 0.0195,
"grad_norm": 1.2706860303878784,
"learning_rate": 7.800000000000002e-06,
"loss": 1.8113,
"num_input_tokens_seen": 327155712,
"step": 39
},
{
"epoch": 0.02,
"grad_norm": 0.9849238395690918,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7541,
"num_input_tokens_seen": 335544320,
"step": 40
},
{
"epoch": 0.0205,
"grad_norm": 0.8385749459266663,
"learning_rate": 8.2e-06,
"loss": 1.5761,
"num_input_tokens_seen": 343932928,
"step": 41
},
{
"epoch": 0.021,
"grad_norm": 1.082259178161621,
"learning_rate": 8.400000000000001e-06,
"loss": 2.0085,
"num_input_tokens_seen": 352321536,
"step": 42
},
{
"epoch": 0.0215,
"grad_norm": 1.1942423582077026,
"learning_rate": 8.6e-06,
"loss": 1.6454,
"num_input_tokens_seen": 360710144,
"step": 43
},
{
"epoch": 0.022,
"grad_norm": 1.2553157806396484,
"learning_rate": 8.8e-06,
"loss": 1.8204,
"num_input_tokens_seen": 369098752,
"step": 44
},
{
"epoch": 0.0225,
"grad_norm": 0.9502832293510437,
"learning_rate": 9e-06,
"loss": 1.7104,
"num_input_tokens_seen": 377487360,
"step": 45
},
{
"epoch": 0.023,
"grad_norm": 0.9545429348945618,
"learning_rate": 9.200000000000002e-06,
"loss": 1.7269,
"num_input_tokens_seen": 385875968,
"step": 46
},
{
"epoch": 0.0235,
"grad_norm": 0.993373692035675,
"learning_rate": 9.4e-06,
"loss": 1.8579,
"num_input_tokens_seen": 394264576,
"step": 47
},
{
"epoch": 0.024,
"grad_norm": 1.3005881309509277,
"learning_rate": 9.600000000000001e-06,
"loss": 1.8702,
"num_input_tokens_seen": 402653184,
"step": 48
},
{
"epoch": 0.0245,
"grad_norm": 1.0115833282470703,
"learning_rate": 9.800000000000001e-06,
"loss": 1.7222,
"num_input_tokens_seen": 411041792,
"step": 49
},
{
"epoch": 0.025,
"grad_norm": 2.056337833404541,
"learning_rate": 1e-05,
"loss": 1.5305,
"num_input_tokens_seen": 419430400,
"step": 50
},
{
"epoch": 0.0255,
"grad_norm": 1.122148036956787,
"learning_rate": 1.02e-05,
"loss": 1.6988,
"num_input_tokens_seen": 427819008,
"step": 51
},
{
"epoch": 0.026,
"grad_norm": 1.1124475002288818,
"learning_rate": 1.04e-05,
"loss": 1.8053,
"num_input_tokens_seen": 436207616,
"step": 52
},
{
"epoch": 0.0265,
"grad_norm": 0.7354093194007874,
"learning_rate": 1.0600000000000002e-05,
"loss": 1.7792,
"num_input_tokens_seen": 444596224,
"step": 53
},
{
"epoch": 0.027,
"grad_norm": 1.4597609043121338,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.7007,
"num_input_tokens_seen": 452984832,
"step": 54
},
{
"epoch": 0.0275,
"grad_norm": 1.0347814559936523,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.7212,
"num_input_tokens_seen": 461373440,
"step": 55
},
{
"epoch": 0.028,
"grad_norm": 1.1491434574127197,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.8258,
"num_input_tokens_seen": 469762048,
"step": 56
},
{
"epoch": 0.0285,
"grad_norm": 1.086042881011963,
"learning_rate": 1.14e-05,
"loss": 1.7252,
"num_input_tokens_seen": 478150656,
"step": 57
},
{
"epoch": 0.029,
"grad_norm": 1.2974258661270142,
"learning_rate": 1.16e-05,
"loss": 1.5316,
"num_input_tokens_seen": 486539264,
"step": 58
},
{
"epoch": 0.0295,
"grad_norm": 1.7874411344528198,
"learning_rate": 1.18e-05,
"loss": 1.7952,
"num_input_tokens_seen": 494927872,
"step": 59
},
{
"epoch": 0.03,
"grad_norm": 1.7470626831054688,
"learning_rate": 1.2e-05,
"loss": 1.4796,
"num_input_tokens_seen": 503316480,
"step": 60
},
{
"epoch": 0.0305,
"grad_norm": 2.2033004760742188,
"learning_rate": 1.22e-05,
"loss": 1.6184,
"num_input_tokens_seen": 511705088,
"step": 61
},
{
"epoch": 0.031,
"grad_norm": 1.5556191205978394,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.749,
"num_input_tokens_seen": 520093696,
"step": 62
},
{
"epoch": 0.0315,
"grad_norm": 1.4915621280670166,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.6627,
"num_input_tokens_seen": 528482304,
"step": 63
},
{
"epoch": 0.032,
"grad_norm": 1.6399468183517456,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.653,
"num_input_tokens_seen": 536870912,
"step": 64
},
{
"epoch": 0.0325,
"grad_norm": 1.6420996189117432,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.7104,
"num_input_tokens_seen": 545259520,
"step": 65
},
{
"epoch": 0.033,
"grad_norm": 1.2998722791671753,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.8448,
"num_input_tokens_seen": 553648128,
"step": 66
},
{
"epoch": 0.0335,
"grad_norm": 1.1958801746368408,
"learning_rate": 1.3400000000000002e-05,
"loss": 1.7359,
"num_input_tokens_seen": 562036736,
"step": 67
},
{
"epoch": 0.034,
"grad_norm": 1.1419895887374878,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.7173,
"num_input_tokens_seen": 570425344,
"step": 68
},
{
"epoch": 0.0345,
"grad_norm": 1.4425727128982544,
"learning_rate": 1.38e-05,
"loss": 1.5033,
"num_input_tokens_seen": 578813952,
"step": 69
},
{
"epoch": 0.035,
"grad_norm": 1.5337550640106201,
"learning_rate": 1.4e-05,
"loss": 1.7122,
"num_input_tokens_seen": 587202560,
"step": 70
},
{
"epoch": 0.0355,
"grad_norm": 0.8321288228034973,
"learning_rate": 1.4200000000000001e-05,
"loss": 1.6591,
"num_input_tokens_seen": 595591168,
"step": 71
},
{
"epoch": 0.036,
"grad_norm": 1.6972109079360962,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.6779,
"num_input_tokens_seen": 603979776,
"step": 72
},
{
"epoch": 0.0365,
"grad_norm": 1.3916453123092651,
"learning_rate": 1.46e-05,
"loss": 1.5565,
"num_input_tokens_seen": 612368384,
"step": 73
},
{
"epoch": 0.037,
"grad_norm": 1.3113203048706055,
"learning_rate": 1.48e-05,
"loss": 1.6577,
"num_input_tokens_seen": 620756992,
"step": 74
},
{
"epoch": 0.0375,
"grad_norm": 1.0601086616516113,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.7513,
"num_input_tokens_seen": 629145600,
"step": 75
},
{
"epoch": 0.038,
"grad_norm": 1.5599173307418823,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.7799,
"num_input_tokens_seen": 637534208,
"step": 76
},
{
"epoch": 0.0385,
"grad_norm": 0.8914459943771362,
"learning_rate": 1.54e-05,
"loss": 1.7983,
"num_input_tokens_seen": 645922816,
"step": 77
},
{
"epoch": 0.039,
"grad_norm": 1.0717469453811646,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.8112,
"num_input_tokens_seen": 654311424,
"step": 78
},
{
"epoch": 0.0395,
"grad_norm": 0.9615082144737244,
"learning_rate": 1.58e-05,
"loss": 1.6442,
"num_input_tokens_seen": 662700032,
"step": 79
},
{
"epoch": 0.04,
"grad_norm": 1.2537950277328491,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.836,
"num_input_tokens_seen": 671088640,
"step": 80
},
{
"epoch": 0.0405,
"grad_norm": 1.1260099411010742,
"learning_rate": 1.62e-05,
"loss": 1.642,
"num_input_tokens_seen": 679477248,
"step": 81
},
{
"epoch": 0.041,
"grad_norm": 1.5180790424346924,
"learning_rate": 1.64e-05,
"loss": 1.6642,
"num_input_tokens_seen": 687865856,
"step": 82
},
{
"epoch": 0.0415,
"grad_norm": 1.1912261247634888,
"learning_rate": 1.66e-05,
"loss": 1.8096,
"num_input_tokens_seen": 696254464,
"step": 83
},
{
"epoch": 0.042,
"grad_norm": 1.0683045387268066,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.7821,
"num_input_tokens_seen": 704643072,
"step": 84
},
{
"epoch": 0.0425,
"grad_norm": 1.234241247177124,
"learning_rate": 1.7e-05,
"loss": 1.7426,
"num_input_tokens_seen": 713031680,
"step": 85
},
{
"epoch": 0.043,
"grad_norm": 1.2772578001022339,
"learning_rate": 1.72e-05,
"loss": 1.6232,
"num_input_tokens_seen": 721420288,
"step": 86
},
{
"epoch": 0.0435,
"grad_norm": 1.238016963005066,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.6116,
"num_input_tokens_seen": 729808896,
"step": 87
},
{
"epoch": 0.044,
"grad_norm": 0.9299501180648804,
"learning_rate": 1.76e-05,
"loss": 1.8499,
"num_input_tokens_seen": 738197504,
"step": 88
},
{
"epoch": 0.0445,
"grad_norm": 1.0743695497512817,
"learning_rate": 1.7800000000000002e-05,
"loss": 1.7032,
"num_input_tokens_seen": 746586112,
"step": 89
},
{
"epoch": 0.045,
"grad_norm": 1.1992101669311523,
"learning_rate": 1.8e-05,
"loss": 1.5495,
"num_input_tokens_seen": 754974720,
"step": 90
},
{
"epoch": 0.0455,
"grad_norm": 1.2545678615570068,
"learning_rate": 1.8200000000000002e-05,
"loss": 1.7138,
"num_input_tokens_seen": 763363328,
"step": 91
},
{
"epoch": 0.046,
"grad_norm": 1.389168381690979,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.7343,
"num_input_tokens_seen": 771751936,
"step": 92
},
{
"epoch": 0.0465,
"grad_norm": 0.9460492730140686,
"learning_rate": 1.86e-05,
"loss": 1.7358,
"num_input_tokens_seen": 780140544,
"step": 93
},
{
"epoch": 0.047,
"grad_norm": 0.9978250861167908,
"learning_rate": 1.88e-05,
"loss": 1.7163,
"num_input_tokens_seen": 788529152,
"step": 94
},
{
"epoch": 0.0475,
"grad_norm": 2.179802656173706,
"learning_rate": 1.9e-05,
"loss": 1.449,
"num_input_tokens_seen": 796917760,
"step": 95
},
{
"epoch": 0.048,
"grad_norm": 1.2629632949829102,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.6696,
"num_input_tokens_seen": 805306368,
"step": 96
},
{
"epoch": 0.0485,
"grad_norm": 1.7194193601608276,
"learning_rate": 1.94e-05,
"loss": 1.6729,
"num_input_tokens_seen": 813694976,
"step": 97
},
{
"epoch": 0.049,
"grad_norm": 1.1110029220581055,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.7822,
"num_input_tokens_seen": 822083584,
"step": 98
},
{
"epoch": 0.0495,
"grad_norm": 1.1010093688964844,
"learning_rate": 1.98e-05,
"loss": 1.5114,
"num_input_tokens_seen": 830472192,
"step": 99
},
{
"epoch": 0.05,
"grad_norm": 1.3180491924285889,
"learning_rate": 2e-05,
"loss": 1.5707,
"num_input_tokens_seen": 838860800,
"step": 100
},
{
"epoch": 0.0505,
"grad_norm": 1.1497364044189453,
"learning_rate": 2.0200000000000003e-05,
"loss": 1.6285,
"num_input_tokens_seen": 847249408,
"step": 101
},
{
"epoch": 0.051,
"grad_norm": 1.4788284301757812,
"learning_rate": 2.04e-05,
"loss": 1.7649,
"num_input_tokens_seen": 855638016,
"step": 102
},
{
"epoch": 0.0515,
"grad_norm": 1.2098913192749023,
"learning_rate": 2.0600000000000003e-05,
"loss": 1.6458,
"num_input_tokens_seen": 864026624,
"step": 103
},
{
"epoch": 0.052,
"grad_norm": 1.4255143404006958,
"learning_rate": 2.08e-05,
"loss": 1.6966,
"num_input_tokens_seen": 872415232,
"step": 104
},
{
"epoch": 0.0525,
"grad_norm": 1.2970800399780273,
"learning_rate": 2.1000000000000002e-05,
"loss": 1.6893,
"num_input_tokens_seen": 880803840,
"step": 105
},
{
"epoch": 0.053,
"grad_norm": 1.4322800636291504,
"learning_rate": 2.1200000000000004e-05,
"loss": 1.7252,
"num_input_tokens_seen": 889192448,
"step": 106
},
{
"epoch": 0.0535,
"grad_norm": 1.0862427949905396,
"learning_rate": 2.1400000000000002e-05,
"loss": 1.702,
"num_input_tokens_seen": 897581056,
"step": 107
},
{
"epoch": 0.054,
"grad_norm": 1.2055437564849854,
"learning_rate": 2.1600000000000003e-05,
"loss": 1.6486,
"num_input_tokens_seen": 905969664,
"step": 108
},
{
"epoch": 0.0545,
"grad_norm": 1.0794836282730103,
"learning_rate": 2.1800000000000005e-05,
"loss": 1.6812,
"num_input_tokens_seen": 914358272,
"step": 109
},
{
"epoch": 0.055,
"grad_norm": 1.3784294128417969,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.5237,
"num_input_tokens_seen": 922746880,
"step": 110
},
{
"epoch": 0.0555,
"grad_norm": 1.5001763105392456,
"learning_rate": 2.2200000000000004e-05,
"loss": 1.6246,
"num_input_tokens_seen": 931135488,
"step": 111
},
{
"epoch": 0.056,
"grad_norm": 1.0864592790603638,
"learning_rate": 2.2400000000000002e-05,
"loss": 1.6973,
"num_input_tokens_seen": 939524096,
"step": 112
},
{
"epoch": 0.0565,
"grad_norm": 1.2886756658554077,
"learning_rate": 2.26e-05,
"loss": 1.6475,
"num_input_tokens_seen": 947912704,
"step": 113
},
{
"epoch": 0.057,
"grad_norm": 1.4063009023666382,
"learning_rate": 2.28e-05,
"loss": 1.7949,
"num_input_tokens_seen": 956301312,
"step": 114
},
{
"epoch": 0.0575,
"grad_norm": 1.1816203594207764,
"learning_rate": 2.3e-05,
"loss": 1.6827,
"num_input_tokens_seen": 964689920,
"step": 115
},
{
"epoch": 0.058,
"grad_norm": 1.368019938468933,
"learning_rate": 2.32e-05,
"loss": 1.5163,
"num_input_tokens_seen": 973078528,
"step": 116
},
{
"epoch": 0.0585,
"grad_norm": 1.1925565004348755,
"learning_rate": 2.34e-05,
"loss": 1.7815,
"num_input_tokens_seen": 981467136,
"step": 117
},
{
"epoch": 0.059,
"grad_norm": 1.7603892087936401,
"learning_rate": 2.36e-05,
"loss": 1.5604,
"num_input_tokens_seen": 989855744,
"step": 118
},
{
"epoch": 0.0595,
"grad_norm": 1.140766978263855,
"learning_rate": 2.3800000000000003e-05,
"loss": 1.7546,
"num_input_tokens_seen": 998244352,
"step": 119
},
{
"epoch": 0.06,
"grad_norm": 1.0011487007141113,
"learning_rate": 2.4e-05,
"loss": 1.7174,
"num_input_tokens_seen": 1006632960,
"step": 120
},
{
"epoch": 0.0605,
"grad_norm": 1.3519712686538696,
"learning_rate": 2.4200000000000002e-05,
"loss": 1.6414,
"num_input_tokens_seen": 1015021568,
"step": 121
},
{
"epoch": 0.061,
"grad_norm": 1.0491664409637451,
"learning_rate": 2.44e-05,
"loss": 1.698,
"num_input_tokens_seen": 1023410176,
"step": 122
},
{
"epoch": 0.0615,
"grad_norm": 1.5017552375793457,
"learning_rate": 2.46e-05,
"loss": 1.7974,
"num_input_tokens_seen": 1031798784,
"step": 123
},
{
"epoch": 0.062,
"grad_norm": 0.9958243370056152,
"learning_rate": 2.4800000000000003e-05,
"loss": 1.6468,
"num_input_tokens_seen": 1040187392,
"step": 124
},
{
"epoch": 0.0625,
"grad_norm": 2.0464437007904053,
"learning_rate": 2.5e-05,
"loss": 1.6911,
"num_input_tokens_seen": 1048576000,
"step": 125
},
{
"epoch": 0.063,
"grad_norm": 1.2215771675109863,
"learning_rate": 2.5200000000000003e-05,
"loss": 1.7483,
"num_input_tokens_seen": 1056964608,
"step": 126
},
{
"epoch": 0.0635,
"grad_norm": 1.4541150331497192,
"learning_rate": 2.54e-05,
"loss": 1.7045,
"num_input_tokens_seen": 1065353216,
"step": 127
},
{
"epoch": 0.064,
"grad_norm": 1.6184258460998535,
"learning_rate": 2.5600000000000002e-05,
"loss": 1.7767,
"num_input_tokens_seen": 1073741824,
"step": 128
},
{
"epoch": 0.0645,
"grad_norm": 1.029843807220459,
"learning_rate": 2.5800000000000004e-05,
"loss": 1.7934,
"num_input_tokens_seen": 1082130432,
"step": 129
},
{
"epoch": 0.065,
"grad_norm": 2.1942873001098633,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.5101,
"num_input_tokens_seen": 1090519040,
"step": 130
},
{
"epoch": 0.0655,
"grad_norm": 1.411434292793274,
"learning_rate": 2.6200000000000003e-05,
"loss": 1.6257,
"num_input_tokens_seen": 1098907648,
"step": 131
},
{
"epoch": 0.066,
"grad_norm": 0.9594506621360779,
"learning_rate": 2.6400000000000005e-05,
"loss": 1.8266,
"num_input_tokens_seen": 1107296256,
"step": 132
},
{
"epoch": 0.0665,
"grad_norm": 1.4641342163085938,
"learning_rate": 2.6600000000000003e-05,
"loss": 1.712,
"num_input_tokens_seen": 1115684864,
"step": 133
},
{
"epoch": 0.067,
"grad_norm": 1.3084245920181274,
"learning_rate": 2.6800000000000004e-05,
"loss": 1.604,
"num_input_tokens_seen": 1124073472,
"step": 134
},
{
"epoch": 0.0675,
"grad_norm": 1.2237942218780518,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.5766,
"num_input_tokens_seen": 1132462080,
"step": 135
},
{
"epoch": 0.068,
"grad_norm": 1.1463581323623657,
"learning_rate": 2.7200000000000004e-05,
"loss": 1.6984,
"num_input_tokens_seen": 1140850688,
"step": 136
},
{
"epoch": 0.0685,
"grad_norm": 1.579965591430664,
"learning_rate": 2.7400000000000005e-05,
"loss": 1.7595,
"num_input_tokens_seen": 1149239296,
"step": 137
},
{
"epoch": 0.069,
"grad_norm": 1.154305338859558,
"learning_rate": 2.76e-05,
"loss": 1.7695,
"num_input_tokens_seen": 1157627904,
"step": 138
},
{
"epoch": 0.0695,
"grad_norm": 1.2724696397781372,
"learning_rate": 2.78e-05,
"loss": 1.6492,
"num_input_tokens_seen": 1166016512,
"step": 139
},
{
"epoch": 0.07,
"grad_norm": 1.0162241458892822,
"learning_rate": 2.8e-05,
"loss": 1.659,
"num_input_tokens_seen": 1174405120,
"step": 140
},
{
"epoch": 0.0705,
"grad_norm": 1.206682562828064,
"learning_rate": 2.82e-05,
"loss": 1.8174,
"num_input_tokens_seen": 1182793728,
"step": 141
},
{
"epoch": 0.071,
"grad_norm": 0.8725315928459167,
"learning_rate": 2.8400000000000003e-05,
"loss": 1.5187,
"num_input_tokens_seen": 1191182336,
"step": 142
},
{
"epoch": 0.0715,
"grad_norm": 1.4840545654296875,
"learning_rate": 2.86e-05,
"loss": 1.5984,
"num_input_tokens_seen": 1199570944,
"step": 143
},
{
"epoch": 0.072,
"grad_norm": 1.3734935522079468,
"learning_rate": 2.8800000000000002e-05,
"loss": 1.5661,
"num_input_tokens_seen": 1207959552,
"step": 144
},
{
"epoch": 0.0725,
"grad_norm": 1.2102882862091064,
"learning_rate": 2.9e-05,
"loss": 1.6857,
"num_input_tokens_seen": 1216348160,
"step": 145
},
{
"epoch": 0.073,
"grad_norm": 1.3926680088043213,
"learning_rate": 2.92e-05,
"loss": 1.695,
"num_input_tokens_seen": 1224736768,
"step": 146
},
{
"epoch": 0.0735,
"grad_norm": 1.3040108680725098,
"learning_rate": 2.9400000000000003e-05,
"loss": 1.6561,
"num_input_tokens_seen": 1233125376,
"step": 147
},
{
"epoch": 0.074,
"grad_norm": 1.168774127960205,
"learning_rate": 2.96e-05,
"loss": 1.7053,
"num_input_tokens_seen": 1241513984,
"step": 148
},
{
"epoch": 0.0745,
"grad_norm": 1.0817087888717651,
"learning_rate": 2.9800000000000003e-05,
"loss": 1.6884,
"num_input_tokens_seen": 1249902592,
"step": 149
},
{
"epoch": 0.075,
"grad_norm": 1.2117892503738403,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.6538,
"num_input_tokens_seen": 1258291200,
"step": 150
},
{
"epoch": 0.0755,
"grad_norm": 1.4789024591445923,
"learning_rate": 3.0200000000000002e-05,
"loss": 1.6475,
"num_input_tokens_seen": 1266679808,
"step": 151
},
{
"epoch": 0.076,
"grad_norm": 1.3695178031921387,
"learning_rate": 3.0400000000000004e-05,
"loss": 1.725,
"num_input_tokens_seen": 1275068416,
"step": 152
},
{
"epoch": 0.0765,
"grad_norm": 1.3215945959091187,
"learning_rate": 3.0600000000000005e-05,
"loss": 1.6756,
"num_input_tokens_seen": 1283457024,
"step": 153
},
{
"epoch": 0.077,
"grad_norm": 1.1858457326889038,
"learning_rate": 3.08e-05,
"loss": 1.755,
"num_input_tokens_seen": 1291845632,
"step": 154
},
{
"epoch": 0.0775,
"grad_norm": 1.4984840154647827,
"learning_rate": 3.1e-05,
"loss": 1.7219,
"num_input_tokens_seen": 1300234240,
"step": 155
},
{
"epoch": 0.078,
"grad_norm": 1.2197405099868774,
"learning_rate": 3.1200000000000006e-05,
"loss": 1.7356,
"num_input_tokens_seen": 1308622848,
"step": 156
},
{
"epoch": 0.0785,
"grad_norm": 1.474221110343933,
"learning_rate": 3.1400000000000004e-05,
"loss": 1.5129,
"num_input_tokens_seen": 1317011456,
"step": 157
},
{
"epoch": 0.079,
"grad_norm": 1.5716886520385742,
"learning_rate": 3.16e-05,
"loss": 1.7484,
"num_input_tokens_seen": 1325400064,
"step": 158
},
{
"epoch": 0.0795,
"grad_norm": 0.9521049857139587,
"learning_rate": 3.180000000000001e-05,
"loss": 1.6262,
"num_input_tokens_seen": 1333788672,
"step": 159
},
{
"epoch": 0.08,
"grad_norm": 1.921792984008789,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.5914,
"num_input_tokens_seen": 1342177280,
"step": 160
},
{
"epoch": 0.0805,
"grad_norm": 1.6440759897232056,
"learning_rate": 3.2200000000000003e-05,
"loss": 1.7481,
"num_input_tokens_seen": 1350565888,
"step": 161
},
{
"epoch": 0.081,
"grad_norm": 1.7987762689590454,
"learning_rate": 3.24e-05,
"loss": 1.6367,
"num_input_tokens_seen": 1358954496,
"step": 162
},
{
"epoch": 0.0815,
"grad_norm": 1.4722107648849487,
"learning_rate": 3.26e-05,
"loss": 1.6079,
"num_input_tokens_seen": 1367343104,
"step": 163
},
{
"epoch": 0.082,
"grad_norm": 1.5897142887115479,
"learning_rate": 3.28e-05,
"loss": 1.5902,
"num_input_tokens_seen": 1375731712,
"step": 164
},
{
"epoch": 0.0825,
"grad_norm": 1.5487194061279297,
"learning_rate": 3.3e-05,
"loss": 1.8002,
"num_input_tokens_seen": 1384120320,
"step": 165
},
{
"epoch": 0.083,
"grad_norm": 1.122914433479309,
"learning_rate": 3.32e-05,
"loss": 1.6447,
"num_input_tokens_seen": 1392508928,
"step": 166
},
{
"epoch": 0.0835,
"grad_norm": 1.5491061210632324,
"learning_rate": 3.34e-05,
"loss": 1.7569,
"num_input_tokens_seen": 1400897536,
"step": 167
},
{
"epoch": 0.084,
"grad_norm": 1.2212718725204468,
"learning_rate": 3.3600000000000004e-05,
"loss": 1.5337,
"num_input_tokens_seen": 1409286144,
"step": 168
},
{
"epoch": 0.0845,
"grad_norm": 1.2301714420318604,
"learning_rate": 3.38e-05,
"loss": 1.6417,
"num_input_tokens_seen": 1417674752,
"step": 169
},
{
"epoch": 0.085,
"grad_norm": 1.242119312286377,
"learning_rate": 3.4e-05,
"loss": 1.8315,
"num_input_tokens_seen": 1426063360,
"step": 170
},
{
"epoch": 0.0855,
"grad_norm": 1.4329941272735596,
"learning_rate": 3.4200000000000005e-05,
"loss": 1.682,
"num_input_tokens_seen": 1434451968,
"step": 171
},
{
"epoch": 0.086,
"grad_norm": 1.633201241493225,
"learning_rate": 3.44e-05,
"loss": 1.6389,
"num_input_tokens_seen": 1442840576,
"step": 172
},
{
"epoch": 0.0865,
"grad_norm": 1.184312343597412,
"learning_rate": 3.46e-05,
"loss": 1.727,
"num_input_tokens_seen": 1451229184,
"step": 173
},
{
"epoch": 0.087,
"grad_norm": 1.2135035991668701,
"learning_rate": 3.4800000000000006e-05,
"loss": 1.7893,
"num_input_tokens_seen": 1459617792,
"step": 174
},
{
"epoch": 0.0875,
"grad_norm": 0.9943047761917114,
"learning_rate": 3.5000000000000004e-05,
"loss": 1.8099,
"num_input_tokens_seen": 1468006400,
"step": 175
},
{
"epoch": 0.088,
"grad_norm": 1.4616221189498901,
"learning_rate": 3.52e-05,
"loss": 1.7955,
"num_input_tokens_seen": 1476395008,
"step": 176
},
{
"epoch": 0.0885,
"grad_norm": 0.9395477771759033,
"learning_rate": 3.54e-05,
"loss": 1.5458,
"num_input_tokens_seen": 1484783616,
"step": 177
},
{
"epoch": 0.089,
"grad_norm": 1.5338860750198364,
"learning_rate": 3.5600000000000005e-05,
"loss": 1.7537,
"num_input_tokens_seen": 1493172224,
"step": 178
},
{
"epoch": 0.0895,
"grad_norm": 1.1360806226730347,
"learning_rate": 3.58e-05,
"loss": 1.58,
"num_input_tokens_seen": 1501560832,
"step": 179
},
{
"epoch": 0.09,
"grad_norm": 1.797577142715454,
"learning_rate": 3.6e-05,
"loss": 1.7989,
"num_input_tokens_seen": 1509949440,
"step": 180
},
{
"epoch": 0.0905,
"grad_norm": 1.5540266036987305,
"learning_rate": 3.6200000000000006e-05,
"loss": 1.4828,
"num_input_tokens_seen": 1518338048,
"step": 181
},
{
"epoch": 0.091,
"grad_norm": 1.776328206062317,
"learning_rate": 3.6400000000000004e-05,
"loss": 1.6651,
"num_input_tokens_seen": 1526726656,
"step": 182
},
{
"epoch": 0.0915,
"grad_norm": 1.5140923261642456,
"learning_rate": 3.66e-05,
"loss": 1.702,
"num_input_tokens_seen": 1535115264,
"step": 183
},
{
"epoch": 0.092,
"grad_norm": 1.5927739143371582,
"learning_rate": 3.680000000000001e-05,
"loss": 1.6738,
"num_input_tokens_seen": 1543503872,
"step": 184
},
{
"epoch": 0.0925,
"grad_norm": 1.1876591444015503,
"learning_rate": 3.7000000000000005e-05,
"loss": 1.673,
"num_input_tokens_seen": 1551892480,
"step": 185
},
{
"epoch": 0.093,
"grad_norm": 1.176761269569397,
"learning_rate": 3.72e-05,
"loss": 1.7108,
"num_input_tokens_seen": 1560281088,
"step": 186
},
{
"epoch": 0.0935,
"grad_norm": 1.4902335405349731,
"learning_rate": 3.740000000000001e-05,
"loss": 1.8053,
"num_input_tokens_seen": 1568669696,
"step": 187
},
{
"epoch": 0.094,
"grad_norm": 1.1420718431472778,
"learning_rate": 3.76e-05,
"loss": 1.7571,
"num_input_tokens_seen": 1577058304,
"step": 188
},
{
"epoch": 0.0945,
"grad_norm": 0.99260413646698,
"learning_rate": 3.7800000000000004e-05,
"loss": 1.6794,
"num_input_tokens_seen": 1585446912,
"step": 189
},
{
"epoch": 0.095,
"grad_norm": 1.57101309299469,
"learning_rate": 3.8e-05,
"loss": 1.6479,
"num_input_tokens_seen": 1593835520,
"step": 190
},
{
"epoch": 0.0955,
"grad_norm": 1.5510618686676025,
"learning_rate": 3.82e-05,
"loss": 1.6632,
"num_input_tokens_seen": 1602224128,
"step": 191
},
{
"epoch": 0.096,
"grad_norm": 1.442740559577942,
"learning_rate": 3.8400000000000005e-05,
"loss": 1.8864,
"num_input_tokens_seen": 1610612736,
"step": 192
},
{
"epoch": 0.0965,
"grad_norm": 1.0593072175979614,
"learning_rate": 3.86e-05,
"loss": 1.5798,
"num_input_tokens_seen": 1619001344,
"step": 193
},
{
"epoch": 0.097,
"grad_norm": 2.065847396850586,
"learning_rate": 3.88e-05,
"loss": 1.5443,
"num_input_tokens_seen": 1627389952,
"step": 194
},
{
"epoch": 0.0975,
"grad_norm": 1.2346482276916504,
"learning_rate": 3.9e-05,
"loss": 1.6052,
"num_input_tokens_seen": 1635778560,
"step": 195
},
{
"epoch": 0.098,
"grad_norm": 1.419586181640625,
"learning_rate": 3.9200000000000004e-05,
"loss": 1.5901,
"num_input_tokens_seen": 1644167168,
"step": 196
},
{
"epoch": 0.0985,
"grad_norm": 1.2483519315719604,
"learning_rate": 3.94e-05,
"loss": 1.6552,
"num_input_tokens_seen": 1652555776,
"step": 197
},
{
"epoch": 0.099,
"grad_norm": 1.5695958137512207,
"learning_rate": 3.96e-05,
"loss": 1.5986,
"num_input_tokens_seen": 1660944384,
"step": 198
},
{
"epoch": 0.0995,
"grad_norm": 1.6311231851577759,
"learning_rate": 3.9800000000000005e-05,
"loss": 1.778,
"num_input_tokens_seen": 1669332992,
"step": 199
},
{
"epoch": 0.1,
"grad_norm": 1.5190160274505615,
"learning_rate": 4e-05,
"loss": 1.8327,
"num_input_tokens_seen": 1677721600,
"step": 200
},
{
"epoch": 0.1005,
"grad_norm": 1.2366491556167603,
"learning_rate": 3.9999845787629415e-05,
"loss": 1.7143,
"num_input_tokens_seen": 1686110208,
"step": 201
},
{
"epoch": 0.101,
"grad_norm": 2.084810495376587,
"learning_rate": 3.99993831528958e-05,
"loss": 1.6544,
"num_input_tokens_seen": 1694498816,
"step": 202
},
{
"epoch": 0.1015,
"grad_norm": 1.5253264904022217,
"learning_rate": 3.9998612102933544e-05,
"loss": 1.6143,
"num_input_tokens_seen": 1702887424,
"step": 203
},
{
"epoch": 0.102,
"grad_norm": 1.0702577829360962,
"learning_rate": 3.999753264963321e-05,
"loss": 1.6856,
"num_input_tokens_seen": 1711276032,
"step": 204
},
{
"epoch": 0.1025,
"grad_norm": 1.9343763589859009,
"learning_rate": 3.9996144809641296e-05,
"loss": 1.786,
"num_input_tokens_seen": 1719664640,
"step": 205
},
{
"epoch": 0.103,
"grad_norm": 1.1403006315231323,
"learning_rate": 3.9994448604360016e-05,
"loss": 1.6452,
"num_input_tokens_seen": 1728053248,
"step": 206
},
{
"epoch": 0.1035,
"grad_norm": 2.63930606842041,
"learning_rate": 3.999244405994694e-05,
"loss": 1.7236,
"num_input_tokens_seen": 1736441856,
"step": 207
},
{
"epoch": 0.104,
"grad_norm": 1.6718311309814453,
"learning_rate": 3.9990131207314634e-05,
"loss": 1.5877,
"num_input_tokens_seen": 1744830464,
"step": 208
},
{
"epoch": 0.1045,
"grad_norm": 2.7320733070373535,
"learning_rate": 3.998751008213014e-05,
"loss": 1.7808,
"num_input_tokens_seen": 1753219072,
"step": 209
},
{
"epoch": 0.105,
"grad_norm": 2.0388472080230713,
"learning_rate": 3.9984580724814464e-05,
"loss": 1.7625,
"num_input_tokens_seen": 1761607680,
"step": 210
},
{
"epoch": 0.1055,
"grad_norm": 2.2198638916015625,
"learning_rate": 3.99813431805419e-05,
"loss": 1.6662,
"num_input_tokens_seen": 1769996288,
"step": 211
},
{
"epoch": 0.106,
"grad_norm": 1.7465423345565796,
"learning_rate": 3.9977797499239404e-05,
"loss": 1.6335,
"num_input_tokens_seen": 1778384896,
"step": 212
},
{
"epoch": 0.1065,
"grad_norm": 1.6627084016799927,
"learning_rate": 3.997394373558576e-05,
"loss": 1.6865,
"num_input_tokens_seen": 1786773504,
"step": 213
},
{
"epoch": 0.107,
"grad_norm": 1.486066222190857,
"learning_rate": 3.996978194901077e-05,
"loss": 1.5947,
"num_input_tokens_seen": 1795162112,
"step": 214
},
{
"epoch": 0.1075,
"grad_norm": 1.2178465127944946,
"learning_rate": 3.996531220369432e-05,
"loss": 1.8448,
"num_input_tokens_seen": 1803550720,
"step": 215
},
{
"epoch": 0.108,
"grad_norm": 1.1107075214385986,
"learning_rate": 3.9960534568565436e-05,
"loss": 1.7868,
"num_input_tokens_seen": 1811939328,
"step": 216
},
{
"epoch": 0.1085,
"grad_norm": 1.3619362115859985,
"learning_rate": 3.995544911730115e-05,
"loss": 1.742,
"num_input_tokens_seen": 1820327936,
"step": 217
},
{
"epoch": 0.109,
"grad_norm": 1.0947133302688599,
"learning_rate": 3.995005592832541e-05,
"loss": 1.7386,
"num_input_tokens_seen": 1828716544,
"step": 218
},
{
"epoch": 0.1095,
"grad_norm": 1.4069256782531738,
"learning_rate": 3.994435508480786e-05,
"loss": 1.7462,
"num_input_tokens_seen": 1837105152,
"step": 219
},
{
"epoch": 0.11,
"grad_norm": 1.4184787273406982,
"learning_rate": 3.9938346674662565e-05,
"loss": 1.7545,
"num_input_tokens_seen": 1845493760,
"step": 220
},
{
"epoch": 0.1105,
"grad_norm": 1.4119457006454468,
"learning_rate": 3.9932030790546636e-05,
"loss": 1.6184,
"num_input_tokens_seen": 1853882368,
"step": 221
},
{
"epoch": 0.111,
"grad_norm": 1.1133450269699097,
"learning_rate": 3.9925407529858826e-05,
"loss": 1.67,
"num_input_tokens_seen": 1862270976,
"step": 222
},
{
"epoch": 0.1115,
"grad_norm": 1.3021692037582397,
"learning_rate": 3.991847699473801e-05,
"loss": 1.6534,
"num_input_tokens_seen": 1870659584,
"step": 223
},
{
"epoch": 0.112,
"grad_norm": 1.10936439037323,
"learning_rate": 3.99112392920616e-05,
"loss": 1.7029,
"num_input_tokens_seen": 1879048192,
"step": 224
},
{
"epoch": 0.1125,
"grad_norm": 0.9188879728317261,
"learning_rate": 3.990369453344394e-05,
"loss": 1.6314,
"num_input_tokens_seen": 1887436800,
"step": 225
},
{
"epoch": 0.113,
"grad_norm": 1.4214354753494263,
"learning_rate": 3.989584283523453e-05,
"loss": 1.6977,
"num_input_tokens_seen": 1895825408,
"step": 226
},
{
"epoch": 0.1135,
"grad_norm": 1.3106968402862549,
"learning_rate": 3.988768431851628e-05,
"loss": 1.613,
"num_input_tokens_seen": 1904214016,
"step": 227
},
{
"epoch": 0.114,
"grad_norm": 1.2800140380859375,
"learning_rate": 3.98792191091036e-05,
"loss": 1.7323,
"num_input_tokens_seen": 1912602624,
"step": 228
},
{
"epoch": 0.1145,
"grad_norm": 0.9443020224571228,
"learning_rate": 3.987044733754049e-05,
"loss": 1.5173,
"num_input_tokens_seen": 1920991232,
"step": 229
},
{
"epoch": 0.115,
"grad_norm": 1.4971925020217896,
"learning_rate": 3.986136913909853e-05,
"loss": 1.8041,
"num_input_tokens_seen": 1929379840,
"step": 230
},
{
"epoch": 0.1155,
"grad_norm": 0.9276608824729919,
"learning_rate": 3.985198465377476e-05,
"loss": 1.8106,
"num_input_tokens_seen": 1937768448,
"step": 231
},
{
"epoch": 0.116,
"grad_norm": 1.3482216596603394,
"learning_rate": 3.9842294026289565e-05,
"loss": 1.6773,
"num_input_tokens_seen": 1946157056,
"step": 232
},
{
"epoch": 0.1165,
"grad_norm": 1.3610389232635498,
"learning_rate": 3.9832297406084386e-05,
"loss": 1.631,
"num_input_tokens_seen": 1954545664,
"step": 233
},
{
"epoch": 0.117,
"grad_norm": 1.1721453666687012,
"learning_rate": 3.98219949473195e-05,
"loss": 1.7756,
"num_input_tokens_seen": 1962934272,
"step": 234
},
{
"epoch": 0.1175,
"grad_norm": 0.8579085469245911,
"learning_rate": 3.981138680887154e-05,
"loss": 1.8969,
"num_input_tokens_seen": 1971322880,
"step": 235
},
{
"epoch": 0.118,
"grad_norm": 0.8704617023468018,
"learning_rate": 3.980047315433116e-05,
"loss": 1.6432,
"num_input_tokens_seen": 1979711488,
"step": 236
},
{
"epoch": 0.1185,
"grad_norm": 1.1833432912826538,
"learning_rate": 3.978925415200037e-05,
"loss": 1.6492,
"num_input_tokens_seen": 1988100096,
"step": 237
},
{
"epoch": 0.119,
"grad_norm": 1.2760549783706665,
"learning_rate": 3.97777299748901e-05,
"loss": 1.6491,
"num_input_tokens_seen": 1996488704,
"step": 238
},
{
"epoch": 0.1195,
"grad_norm": 0.8377672433853149,
"learning_rate": 3.976590080071739e-05,
"loss": 1.7364,
"num_input_tokens_seen": 2004877312,
"step": 239
},
{
"epoch": 0.12,
"grad_norm": 1.6421141624450684,
"learning_rate": 3.9753766811902756e-05,
"loss": 1.4844,
"num_input_tokens_seen": 2013265920,
"step": 240
},
{
"epoch": 0.1205,
"grad_norm": 1.1873599290847778,
"learning_rate": 3.974132819556731e-05,
"loss": 1.7483,
"num_input_tokens_seen": 2021654528,
"step": 241
},
{
"epoch": 0.121,
"grad_norm": 1.206591248512268,
"learning_rate": 3.972858514352991e-05,
"loss": 1.7342,
"num_input_tokens_seen": 2030043136,
"step": 242
},
{
"epoch": 0.1215,
"grad_norm": 0.9489218592643738,
"learning_rate": 3.971553785230418e-05,
"loss": 1.7117,
"num_input_tokens_seen": 2038431744,
"step": 243
},
{
"epoch": 0.122,
"grad_norm": 1.023648977279663,
"learning_rate": 3.970218652309548e-05,
"loss": 1.5842,
"num_input_tokens_seen": 2046820352,
"step": 244
},
{
"epoch": 0.1225,
"grad_norm": 1.1487878561019897,
"learning_rate": 3.9688531361797834e-05,
"loss": 1.7356,
"num_input_tokens_seen": 2055208960,
"step": 245
},
{
"epoch": 0.123,
"grad_norm": 1.0160244703292847,
"learning_rate": 3.9674572578990724e-05,
"loss": 1.682,
"num_input_tokens_seen": 2063597568,
"step": 246
},
{
"epoch": 0.1235,
"grad_norm": 1.105434536933899,
"learning_rate": 3.9660310389935837e-05,
"loss": 1.6695,
"num_input_tokens_seen": 2071986176,
"step": 247
},
{
"epoch": 0.124,
"grad_norm": 1.0896682739257812,
"learning_rate": 3.964574501457378e-05,
"loss": 1.7885,
"num_input_tokens_seen": 2080374784,
"step": 248
},
{
"epoch": 0.1245,
"grad_norm": 1.0165361166000366,
"learning_rate": 3.9630876677520656e-05,
"loss": 1.579,
"num_input_tokens_seen": 2088763392,
"step": 249
},
{
"epoch": 0.125,
"grad_norm": 1.211041808128357,
"learning_rate": 3.961570560806461e-05,
"loss": 1.6509,
"num_input_tokens_seen": 2097152000,
"step": 250
},
{
"epoch": 0.1255,
"grad_norm": 1.0447190999984741,
"learning_rate": 3.960023204016231e-05,
"loss": 1.8698,
"num_input_tokens_seen": 2105540608,
"step": 251
},
{
"epoch": 0.126,
"grad_norm": 1.3021482229232788,
"learning_rate": 3.958445621243532e-05,
"loss": 1.7579,
"num_input_tokens_seen": 2113929216,
"step": 252
},
{
"epoch": 0.1265,
"grad_norm": 0.9119377732276917,
"learning_rate": 3.9568378368166406e-05,
"loss": 1.6945,
"num_input_tokens_seen": 2122317824,
"step": 253
},
{
"epoch": 0.127,
"grad_norm": 0.8316543698310852,
"learning_rate": 3.955199875529582e-05,
"loss": 1.6739,
"num_input_tokens_seen": 2130706432,
"step": 254
},
{
"epoch": 0.1275,
"grad_norm": 0.9795516133308411,
"learning_rate": 3.953531762641745e-05,
"loss": 1.7518,
"num_input_tokens_seen": 2139095040,
"step": 255
},
{
"epoch": 0.128,
"grad_norm": 0.9370044469833374,
"learning_rate": 3.951833523877495e-05,
"loss": 1.6134,
"num_input_tokens_seen": 2147483648,
"step": 256
},
{
"epoch": 0.1285,
"grad_norm": 1.6189143657684326,
"learning_rate": 3.9501051854257745e-05,
"loss": 1.6523,
"num_input_tokens_seen": 2155872256,
"step": 257
},
{
"epoch": 0.129,
"grad_norm": 1.1166945695877075,
"learning_rate": 3.948346773939699e-05,
"loss": 1.7523,
"num_input_tokens_seen": 2164260864,
"step": 258
},
{
"epoch": 0.1295,
"grad_norm": 1.0886536836624146,
"learning_rate": 3.94655831653615e-05,
"loss": 1.622,
"num_input_tokens_seen": 2172649472,
"step": 259
},
{
"epoch": 0.13,
"grad_norm": 1.7367981672286987,
"learning_rate": 3.9447398407953536e-05,
"loss": 1.7358,
"num_input_tokens_seen": 2181038080,
"step": 260
},
{
"epoch": 0.1305,
"grad_norm": 1.2050344944000244,
"learning_rate": 3.942891374760455e-05,
"loss": 1.5398,
"num_input_tokens_seen": 2189426688,
"step": 261
},
{
"epoch": 0.131,
"grad_norm": 2.18412709236145,
"learning_rate": 3.941012946937085e-05,
"loss": 1.5405,
"num_input_tokens_seen": 2197815296,
"step": 262
},
{
"epoch": 0.1315,
"grad_norm": 1.3102896213531494,
"learning_rate": 3.9391045862929275e-05,
"loss": 1.8383,
"num_input_tokens_seen": 2206203904,
"step": 263
},
{
"epoch": 0.132,
"grad_norm": 1.4639437198638916,
"learning_rate": 3.9371663222572625e-05,
"loss": 1.6872,
"num_input_tokens_seen": 2214592512,
"step": 264
},
{
"epoch": 0.1325,
"grad_norm": 1.346665859222412,
"learning_rate": 3.93519818472052e-05,
"loss": 1.7861,
"num_input_tokens_seen": 2222981120,
"step": 265
},
{
"epoch": 0.133,
"grad_norm": 1.4441630840301514,
"learning_rate": 3.933200204033815e-05,
"loss": 1.4984,
"num_input_tokens_seen": 2231369728,
"step": 266
},
{
"epoch": 0.1335,
"grad_norm": 1.3043477535247803,
"learning_rate": 3.931172411008482e-05,
"loss": 1.7591,
"num_input_tokens_seen": 2239758336,
"step": 267
},
{
"epoch": 0.134,
"grad_norm": 1.2089747190475464,
"learning_rate": 3.9291148369155964e-05,
"loss": 1.7255,
"num_input_tokens_seen": 2248146944,
"step": 268
},
{
"epoch": 0.1345,
"grad_norm": 0.9772351980209351,
"learning_rate": 3.927027513485498e-05,
"loss": 1.6875,
"num_input_tokens_seen": 2256535552,
"step": 269
},
{
"epoch": 0.135,
"grad_norm": 0.9826438426971436,
"learning_rate": 3.9249104729072944e-05,
"loss": 1.7025,
"num_input_tokens_seen": 2264924160,
"step": 270
},
{
"epoch": 0.1355,
"grad_norm": 1.421626329421997,
"learning_rate": 3.9227637478283725e-05,
"loss": 1.7574,
"num_input_tokens_seen": 2273312768,
"step": 271
},
{
"epoch": 0.136,
"grad_norm": 1.3830302953720093,
"learning_rate": 3.9205873713538864e-05,
"loss": 1.7882,
"num_input_tokens_seen": 2281701376,
"step": 272
},
{
"epoch": 0.1365,
"grad_norm": 1.0225855112075806,
"learning_rate": 3.918381377046255e-05,
"loss": 1.5312,
"num_input_tokens_seen": 2290089984,
"step": 273
},
{
"epoch": 0.137,
"grad_norm": 1.8110359907150269,
"learning_rate": 3.916145798924639e-05,
"loss": 1.7218,
"num_input_tokens_seen": 2298478592,
"step": 274
},
{
"epoch": 0.1375,
"grad_norm": 1.0985193252563477,
"learning_rate": 3.913880671464418e-05,
"loss": 1.6563,
"num_input_tokens_seen": 2306867200,
"step": 275
},
{
"epoch": 0.138,
"grad_norm": 1.680709719657898,
"learning_rate": 3.911586029596661e-05,
"loss": 1.6444,
"num_input_tokens_seen": 2315255808,
"step": 276
},
{
"epoch": 0.1385,
"grad_norm": 1.201578974723816,
"learning_rate": 3.9092619087075825e-05,
"loss": 1.6551,
"num_input_tokens_seen": 2323644416,
"step": 277
},
{
"epoch": 0.139,
"grad_norm": 1.4566112756729126,
"learning_rate": 3.906908344638002e-05,
"loss": 1.6357,
"num_input_tokens_seen": 2332033024,
"step": 278
},
{
"epoch": 0.1395,
"grad_norm": 1.2596595287322998,
"learning_rate": 3.904525373682791e-05,
"loss": 1.7801,
"num_input_tokens_seen": 2340421632,
"step": 279
},
{
"epoch": 0.14,
"grad_norm": 1.625712513923645,
"learning_rate": 3.9021130325903076e-05,
"loss": 1.7944,
"num_input_tokens_seen": 2348810240,
"step": 280
},
{
"epoch": 0.1405,
"grad_norm": 1.0394383668899536,
"learning_rate": 3.8996713585618354e-05,
"loss": 1.7922,
"num_input_tokens_seen": 2357198848,
"step": 281
},
{
"epoch": 0.141,
"grad_norm": 1.8562290668487549,
"learning_rate": 3.897200389251009e-05,
"loss": 1.7078,
"num_input_tokens_seen": 2365587456,
"step": 282
},
{
"epoch": 0.1415,
"grad_norm": 1.245281457901001,
"learning_rate": 3.8947001627632326e-05,
"loss": 1.6801,
"num_input_tokens_seen": 2373976064,
"step": 283
},
{
"epoch": 0.142,
"grad_norm": 1.5256640911102295,
"learning_rate": 3.892170717655091e-05,
"loss": 1.7155,
"num_input_tokens_seen": 2382364672,
"step": 284
},
{
"epoch": 0.1425,
"grad_norm": 1.0656960010528564,
"learning_rate": 3.889612092933756e-05,
"loss": 1.75,
"num_input_tokens_seen": 2390753280,
"step": 285
},
{
"epoch": 0.143,
"grad_norm": 1.204822063446045,
"learning_rate": 3.887024328056387e-05,
"loss": 1.7464,
"num_input_tokens_seen": 2399141888,
"step": 286
},
{
"epoch": 0.1435,
"grad_norm": 1.1441707611083984,
"learning_rate": 3.88440746292952e-05,
"loss": 1.6608,
"num_input_tokens_seen": 2407530496,
"step": 287
},
{
"epoch": 0.144,
"grad_norm": 1.2725075483322144,
"learning_rate": 3.8817615379084514e-05,
"loss": 1.5159,
"num_input_tokens_seen": 2415919104,
"step": 288
},
{
"epoch": 0.1445,
"grad_norm": 1.1600598096847534,
"learning_rate": 3.879086593796618e-05,
"loss": 1.5823,
"num_input_tokens_seen": 2424307712,
"step": 289
},
{
"epoch": 0.145,
"grad_norm": 1.1082223653793335,
"learning_rate": 3.876382671844969e-05,
"loss": 1.6134,
"num_input_tokens_seen": 2432696320,
"step": 290
},
{
"epoch": 0.1455,
"grad_norm": 1.1749600172042847,
"learning_rate": 3.873649813751323e-05,
"loss": 1.5625,
"num_input_tokens_seen": 2441084928,
"step": 291
},
{
"epoch": 0.146,
"grad_norm": 1.005366325378418,
"learning_rate": 3.870888061659735e-05,
"loss": 1.6975,
"num_input_tokens_seen": 2449473536,
"step": 292
},
{
"epoch": 0.1465,
"grad_norm": 0.8248304724693298,
"learning_rate": 3.8680974581598375e-05,
"loss": 1.607,
"num_input_tokens_seen": 2457862144,
"step": 293
},
{
"epoch": 0.147,
"grad_norm": 0.8967552185058594,
"learning_rate": 3.865278046286189e-05,
"loss": 1.3385,
"num_input_tokens_seen": 2466250752,
"step": 294
},
{
"epoch": 0.1475,
"grad_norm": 1.328721046447754,
"learning_rate": 3.862429869517607e-05,
"loss": 1.6221,
"num_input_tokens_seen": 2474639360,
"step": 295
},
{
"epoch": 0.148,
"grad_norm": 0.917224645614624,
"learning_rate": 3.859552971776503e-05,
"loss": 1.6286,
"num_input_tokens_seen": 2483027968,
"step": 296
},
{
"epoch": 0.1485,
"grad_norm": 0.88497394323349,
"learning_rate": 3.856647397428198e-05,
"loss": 1.6966,
"num_input_tokens_seen": 2491416576,
"step": 297
},
{
"epoch": 0.149,
"grad_norm": 1.0170094966888428,
"learning_rate": 3.853713191280242e-05,
"loss": 1.6923,
"num_input_tokens_seen": 2499805184,
"step": 298
},
{
"epoch": 0.1495,
"grad_norm": 1.137525200843811,
"learning_rate": 3.850750398581725e-05,
"loss": 1.7734,
"num_input_tokens_seen": 2508193792,
"step": 299
},
{
"epoch": 0.15,
"grad_norm": 1.124685287475586,
"learning_rate": 3.8477590650225735e-05,
"loss": 1.6916,
"num_input_tokens_seen": 2516582400,
"step": 300
},
{
"epoch": 0.1505,
"grad_norm": 0.8273991942405701,
"learning_rate": 3.8447392367328535e-05,
"loss": 1.7422,
"num_input_tokens_seen": 2524971008,
"step": 301
},
{
"epoch": 0.151,
"grad_norm": 0.7442718744277954,
"learning_rate": 3.8416909602820534e-05,
"loss": 1.7876,
"num_input_tokens_seen": 2533359616,
"step": 302
},
{
"epoch": 0.1515,
"grad_norm": 1.1291152238845825,
"learning_rate": 3.8386142826783645e-05,
"loss": 1.7041,
"num_input_tokens_seen": 2541748224,
"step": 303
},
{
"epoch": 0.152,
"grad_norm": 0.8179329037666321,
"learning_rate": 3.835509251367963e-05,
"loss": 1.6661,
"num_input_tokens_seen": 2550136832,
"step": 304
},
{
"epoch": 0.1525,
"grad_norm": 0.7719331383705139,
"learning_rate": 3.832375914234272e-05,
"loss": 1.6901,
"num_input_tokens_seen": 2558525440,
"step": 305
},
{
"epoch": 0.153,
"grad_norm": 0.8355837464332581,
"learning_rate": 3.829214319597228e-05,
"loss": 1.6282,
"num_input_tokens_seen": 2566914048,
"step": 306
},
{
"epoch": 0.1535,
"grad_norm": 1.228195071220398,
"learning_rate": 3.826024516212529e-05,
"loss": 1.5801,
"num_input_tokens_seen": 2575302656,
"step": 307
},
{
"epoch": 0.154,
"grad_norm": 1.644123911857605,
"learning_rate": 3.8228065532708905e-05,
"loss": 1.626,
"num_input_tokens_seen": 2583691264,
"step": 308
},
{
"epoch": 0.1545,
"grad_norm": 0.9140821695327759,
"learning_rate": 3.819560480397282e-05,
"loss": 1.671,
"num_input_tokens_seen": 2592079872,
"step": 309
},
{
"epoch": 0.155,
"grad_norm": 1.3930050134658813,
"learning_rate": 3.816286347650163e-05,
"loss": 1.5599,
"num_input_tokens_seen": 2600468480,
"step": 310
},
{
"epoch": 0.1555,
"grad_norm": 1.069976568222046,
"learning_rate": 3.81298420552071e-05,
"loss": 1.6816,
"num_input_tokens_seen": 2608857088,
"step": 311
},
{
"epoch": 0.156,
"grad_norm": 1.2245848178863525,
"learning_rate": 3.809654104932039e-05,
"loss": 1.4973,
"num_input_tokens_seen": 2617245696,
"step": 312
},
{
"epoch": 0.1565,
"grad_norm": 1.045593500137329,
"learning_rate": 3.8062960972384223e-05,
"loss": 1.6431,
"num_input_tokens_seen": 2625634304,
"step": 313
},
{
"epoch": 0.157,
"grad_norm": 1.0730377435684204,
"learning_rate": 3.802910234224491e-05,
"loss": 1.8426,
"num_input_tokens_seen": 2634022912,
"step": 314
},
{
"epoch": 0.1575,
"grad_norm": 1.0670175552368164,
"learning_rate": 3.7994965681044436e-05,
"loss": 1.6098,
"num_input_tokens_seen": 2642411520,
"step": 315
},
{
"epoch": 0.158,
"grad_norm": 1.2215880155563354,
"learning_rate": 3.796055151521231e-05,
"loss": 1.6359,
"num_input_tokens_seen": 2650800128,
"step": 316
},
{
"epoch": 0.1585,
"grad_norm": 0.8806067705154419,
"learning_rate": 3.792586037545758e-05,
"loss": 1.5976,
"num_input_tokens_seen": 2659188736,
"step": 317
},
{
"epoch": 0.159,
"grad_norm": 0.8471524119377136,
"learning_rate": 3.78908927967605e-05,
"loss": 1.6655,
"num_input_tokens_seen": 2667577344,
"step": 318
},
{
"epoch": 0.1595,
"grad_norm": 0.8181973099708557,
"learning_rate": 3.785564931836442e-05,
"loss": 1.6711,
"num_input_tokens_seen": 2675965952,
"step": 319
},
{
"epoch": 0.16,
"grad_norm": 0.8937894105911255,
"learning_rate": 3.782013048376736e-05,
"loss": 1.5473,
"num_input_tokens_seen": 2684354560,
"step": 320
},
{
"epoch": 0.1605,
"grad_norm": 1.0847939252853394,
"learning_rate": 3.778433684071369e-05,
"loss": 1.6359,
"num_input_tokens_seen": 2692743168,
"step": 321
},
{
"epoch": 0.161,
"grad_norm": 1.1049737930297852,
"learning_rate": 3.774826894118567e-05,
"loss": 1.7373,
"num_input_tokens_seen": 2701131776,
"step": 322
},
{
"epoch": 0.1615,
"grad_norm": 0.8523493409156799,
"learning_rate": 3.7711927341394916e-05,
"loss": 1.5551,
"num_input_tokens_seen": 2709520384,
"step": 323
},
{
"epoch": 0.162,
"grad_norm": 0.9004189372062683,
"learning_rate": 3.7675312601773874e-05,
"loss": 1.5898,
"num_input_tokens_seen": 2717908992,
"step": 324
},
{
"epoch": 0.1625,
"grad_norm": 1.4729281663894653,
"learning_rate": 3.76384252869671e-05,
"loss": 1.6389,
"num_input_tokens_seen": 2726297600,
"step": 325
},
{
"epoch": 0.163,
"grad_norm": 0.9593777656555176,
"learning_rate": 3.760126596582264e-05,
"loss": 1.674,
"num_input_tokens_seen": 2734686208,
"step": 326
},
{
"epoch": 0.1635,
"grad_norm": 0.8906177878379822,
"learning_rate": 3.756383521138319e-05,
"loss": 1.5994,
"num_input_tokens_seen": 2743074816,
"step": 327
},
{
"epoch": 0.164,
"grad_norm": 0.7630260586738586,
"learning_rate": 3.7526133600877275e-05,
"loss": 1.6662,
"num_input_tokens_seen": 2751463424,
"step": 328
},
{
"epoch": 0.1645,
"grad_norm": 0.837302029132843,
"learning_rate": 3.748816171571038e-05,
"loss": 1.739,
"num_input_tokens_seen": 2759852032,
"step": 329
},
{
"epoch": 0.165,
"grad_norm": 0.9471412897109985,
"learning_rate": 3.744992014145595e-05,
"loss": 1.6867,
"num_input_tokens_seen": 2768240640,
"step": 330
},
{
"epoch": 0.1655,
"grad_norm": 0.7041006684303284,
"learning_rate": 3.741140946784635e-05,
"loss": 1.7248,
"num_input_tokens_seen": 2776629248,
"step": 331
},
{
"epoch": 0.166,
"grad_norm": 0.7114289999008179,
"learning_rate": 3.737263028876383e-05,
"loss": 1.5945,
"num_input_tokens_seen": 2785017856,
"step": 332
},
{
"epoch": 0.1665,
"grad_norm": 0.7291207313537598,
"learning_rate": 3.733358320223128e-05,
"loss": 1.4206,
"num_input_tokens_seen": 2793406464,
"step": 333
},
{
"epoch": 0.167,
"grad_norm": 0.7624229192733765,
"learning_rate": 3.729426881040311e-05,
"loss": 1.587,
"num_input_tokens_seen": 2801795072,
"step": 334
},
{
"epoch": 0.1675,
"grad_norm": 0.6349395513534546,
"learning_rate": 3.725468771955584e-05,
"loss": 1.8279,
"num_input_tokens_seen": 2810183680,
"step": 335
},
{
"epoch": 0.168,
"grad_norm": 0.6584758162498474,
"learning_rate": 3.721484054007888e-05,
"loss": 1.5794,
"num_input_tokens_seen": 2818572288,
"step": 336
},
{
"epoch": 0.1685,
"grad_norm": 0.8144023418426514,
"learning_rate": 3.717472788646501e-05,
"loss": 1.7197,
"num_input_tokens_seen": 2826960896,
"step": 337
},
{
"epoch": 0.169,
"grad_norm": 0.9008212685585022,
"learning_rate": 3.7134350377301e-05,
"loss": 1.5056,
"num_input_tokens_seen": 2835349504,
"step": 338
},
{
"epoch": 0.1695,
"grad_norm": 1.026350498199463,
"learning_rate": 3.709370863525796e-05,
"loss": 1.6921,
"num_input_tokens_seen": 2843738112,
"step": 339
},
{
"epoch": 0.17,
"grad_norm": 1.0004777908325195,
"learning_rate": 3.705280328708185e-05,
"loss": 1.6245,
"num_input_tokens_seen": 2852126720,
"step": 340
},
{
"epoch": 0.1705,
"grad_norm": 1.0942559242248535,
"learning_rate": 3.701163496358373e-05,
"loss": 1.5393,
"num_input_tokens_seen": 2860515328,
"step": 341
},
{
"epoch": 0.171,
"grad_norm": 0.7471975684165955,
"learning_rate": 3.6970204299630077e-05,
"loss": 1.7802,
"num_input_tokens_seen": 2868903936,
"step": 342
},
{
"epoch": 0.1715,
"grad_norm": 0.9047533869743347,
"learning_rate": 3.692851193413299e-05,
"loss": 1.5245,
"num_input_tokens_seen": 2877292544,
"step": 343
},
{
"epoch": 0.172,
"grad_norm": 1.4945957660675049,
"learning_rate": 3.6886558510040305e-05,
"loss": 1.6707,
"num_input_tokens_seen": 2885681152,
"step": 344
},
{
"epoch": 0.1725,
"grad_norm": 0.808014452457428,
"learning_rate": 3.684434467432573e-05,
"loss": 1.7578,
"num_input_tokens_seen": 2894069760,
"step": 345
},
{
"epoch": 0.173,
"grad_norm": 0.9657407402992249,
"learning_rate": 3.680187107797884e-05,
"loss": 1.6556,
"num_input_tokens_seen": 2902458368,
"step": 346
},
{
"epoch": 0.1735,
"grad_norm": 1.2704793214797974,
"learning_rate": 3.675913837599503e-05,
"loss": 1.5934,
"num_input_tokens_seen": 2910846976,
"step": 347
},
{
"epoch": 0.174,
"grad_norm": 0.90053790807724,
"learning_rate": 3.671614722736541e-05,
"loss": 1.7298,
"num_input_tokens_seen": 2919235584,
"step": 348
},
{
"epoch": 0.1745,
"grad_norm": 0.810030460357666,
"learning_rate": 3.667289829506669e-05,
"loss": 1.5586,
"num_input_tokens_seen": 2927624192,
"step": 349
},
{
"epoch": 0.175,
"grad_norm": 1.1691352128982544,
"learning_rate": 3.662939224605091e-05,
"loss": 1.6294,
"num_input_tokens_seen": 2936012800,
"step": 350
},
{
"epoch": 0.1755,
"grad_norm": 1.1451619863510132,
"learning_rate": 3.658562975123516e-05,
"loss": 1.4937,
"num_input_tokens_seen": 2944401408,
"step": 351
},
{
"epoch": 0.176,
"grad_norm": 0.9522868990898132,
"learning_rate": 3.654161148549124e-05,
"loss": 1.7785,
"num_input_tokens_seen": 2952790016,
"step": 352
},
{
"epoch": 0.1765,
"grad_norm": 0.7397971153259277,
"learning_rate": 3.649733812763527e-05,
"loss": 1.713,
"num_input_tokens_seen": 2961178624,
"step": 353
},
{
"epoch": 0.177,
"grad_norm": 0.7993081212043762,
"learning_rate": 3.64528103604172e-05,
"loss": 1.5611,
"num_input_tokens_seen": 2969567232,
"step": 354
},
{
"epoch": 0.1775,
"grad_norm": 0.8007155060768127,
"learning_rate": 3.640802887051027e-05,
"loss": 1.5691,
"num_input_tokens_seen": 2977955840,
"step": 355
},
{
"epoch": 0.178,
"grad_norm": 0.8291440606117249,
"learning_rate": 3.636299434850047e-05,
"loss": 1.4725,
"num_input_tokens_seen": 2986344448,
"step": 356
},
{
"epoch": 0.1785,
"grad_norm": 1.0753155946731567,
"learning_rate": 3.631770748887583e-05,
"loss": 1.5618,
"num_input_tokens_seen": 2994733056,
"step": 357
},
{
"epoch": 0.179,
"grad_norm": 0.8969347476959229,
"learning_rate": 3.627216899001575e-05,
"loss": 1.6699,
"num_input_tokens_seen": 3003121664,
"step": 358
},
{
"epoch": 0.1795,
"grad_norm": 0.8943629264831543,
"learning_rate": 3.62263795541802e-05,
"loss": 1.7087,
"num_input_tokens_seen": 3011510272,
"step": 359
},
{
"epoch": 0.18,
"grad_norm": 0.9483537077903748,
"learning_rate": 3.6180339887498953e-05,
"loss": 1.6257,
"num_input_tokens_seen": 3019898880,
"step": 360
},
{
"epoch": 0.1805,
"grad_norm": 0.8657635450363159,
"learning_rate": 3.6134050699960604e-05,
"loss": 1.689,
"num_input_tokens_seen": 3028287488,
"step": 361
},
{
"epoch": 0.181,
"grad_norm": 0.7815025448799133,
"learning_rate": 3.608751270540169e-05,
"loss": 1.6175,
"num_input_tokens_seen": 3036676096,
"step": 362
},
{
"epoch": 0.1815,
"grad_norm": 0.7870849967002869,
"learning_rate": 3.604072662149567e-05,
"loss": 1.5586,
"num_input_tokens_seen": 3045064704,
"step": 363
},
{
"epoch": 0.182,
"grad_norm": 0.8339277505874634,
"learning_rate": 3.599369316974182e-05,
"loss": 1.5271,
"num_input_tokens_seen": 3053453312,
"step": 364
},
{
"epoch": 0.1825,
"grad_norm": 0.9285105466842651,
"learning_rate": 3.594641307545414e-05,
"loss": 1.7041,
"num_input_tokens_seen": 3061841920,
"step": 365
},
{
"epoch": 0.183,
"grad_norm": 0.8689367175102234,
"learning_rate": 3.58988870677502e-05,
"loss": 1.4738,
"num_input_tokens_seen": 3070230528,
"step": 366
},
{
"epoch": 0.1835,
"grad_norm": 0.929764986038208,
"learning_rate": 3.585111587953982e-05,
"loss": 1.7297,
"num_input_tokens_seen": 3078619136,
"step": 367
},
{
"epoch": 0.184,
"grad_norm": 1.05793035030365,
"learning_rate": 3.580310024751381e-05,
"loss": 1.5431,
"num_input_tokens_seen": 3087007744,
"step": 368
},
{
"epoch": 0.1845,
"grad_norm": 0.8866976499557495,
"learning_rate": 3.575484091213262e-05,
"loss": 1.6682,
"num_input_tokens_seen": 3095396352,
"step": 369
},
{
"epoch": 0.185,
"grad_norm": 0.7259969115257263,
"learning_rate": 3.57063386176149e-05,
"loss": 1.6123,
"num_input_tokens_seen": 3103784960,
"step": 370
},
{
"epoch": 0.1855,
"grad_norm": 0.9831200838088989,
"learning_rate": 3.565759411192604e-05,
"loss": 1.5845,
"num_input_tokens_seen": 3112173568,
"step": 371
},
{
"epoch": 0.186,
"grad_norm": 0.9731907248497009,
"learning_rate": 3.5608608146766597e-05,
"loss": 1.7179,
"num_input_tokens_seen": 3120562176,
"step": 372
},
{
"epoch": 0.1865,
"grad_norm": 1.0148561000823975,
"learning_rate": 3.555938147756077e-05,
"loss": 1.9192,
"num_input_tokens_seen": 3128950784,
"step": 373
},
{
"epoch": 0.187,
"grad_norm": 0.9084612131118774,
"learning_rate": 3.5509914863444694e-05,
"loss": 1.5912,
"num_input_tokens_seen": 3137339392,
"step": 374
},
{
"epoch": 0.1875,
"grad_norm": 0.9402855038642883,
"learning_rate": 3.546020906725474e-05,
"loss": 1.5803,
"num_input_tokens_seen": 3145728000,
"step": 375
},
{
"epoch": 0.188,
"grad_norm": 0.9808095693588257,
"learning_rate": 3.541026485551579e-05,
"loss": 1.8292,
"num_input_tokens_seen": 3154116608,
"step": 376
},
{
"epoch": 0.1885,
"grad_norm": 1.1110026836395264,
"learning_rate": 3.536008299842936e-05,
"loss": 1.6698,
"num_input_tokens_seen": 3162505216,
"step": 377
},
{
"epoch": 0.189,
"grad_norm": 1.0544601678848267,
"learning_rate": 3.530966426986177e-05,
"loss": 1.6978,
"num_input_tokens_seen": 3170893824,
"step": 378
},
{
"epoch": 0.1895,
"grad_norm": 0.7350985407829285,
"learning_rate": 3.525900944733218e-05,
"loss": 1.6923,
"num_input_tokens_seen": 3179282432,
"step": 379
},
{
"epoch": 0.19,
"grad_norm": 0.9171492457389832,
"learning_rate": 3.520811931200063e-05,
"loss": 1.6079,
"num_input_tokens_seen": 3187671040,
"step": 380
},
{
"epoch": 0.1905,
"grad_norm": 1.1049323081970215,
"learning_rate": 3.515699464865594e-05,
"loss": 1.6878,
"num_input_tokens_seen": 3196059648,
"step": 381
},
{
"epoch": 0.191,
"grad_norm": 0.776717483997345,
"learning_rate": 3.5105636245703675e-05,
"loss": 1.5757,
"num_input_tokens_seen": 3204448256,
"step": 382
},
{
"epoch": 0.1915,
"grad_norm": 0.8234217166900635,
"learning_rate": 3.505404489515394e-05,
"loss": 1.6609,
"num_input_tokens_seen": 3212836864,
"step": 383
},
{
"epoch": 0.192,
"grad_norm": 0.854977011680603,
"learning_rate": 3.5002221392609196e-05,
"loss": 1.6726,
"num_input_tokens_seen": 3221225472,
"step": 384
},
{
"epoch": 0.1925,
"grad_norm": 0.6885860562324524,
"learning_rate": 3.495016653725194e-05,
"loss": 1.7057,
"num_input_tokens_seen": 3229614080,
"step": 385
},
{
"epoch": 0.193,
"grad_norm": 0.8042221665382385,
"learning_rate": 3.489788113183244e-05,
"loss": 1.6693,
"num_input_tokens_seen": 3238002688,
"step": 386
},
{
"epoch": 0.1935,
"grad_norm": 0.8148207664489746,
"learning_rate": 3.484536598265634e-05,
"loss": 1.5554,
"num_input_tokens_seen": 3246391296,
"step": 387
},
{
"epoch": 0.194,
"grad_norm": 0.8488610982894897,
"learning_rate": 3.47926218995722e-05,
"loss": 1.5941,
"num_input_tokens_seen": 3254779904,
"step": 388
},
{
"epoch": 0.1945,
"grad_norm": 0.7625729441642761,
"learning_rate": 3.473964969595902e-05,
"loss": 1.5805,
"num_input_tokens_seen": 3263168512,
"step": 389
},
{
"epoch": 0.195,
"grad_norm": 0.8852419853210449,
"learning_rate": 3.468645018871371e-05,
"loss": 1.6924,
"num_input_tokens_seen": 3271557120,
"step": 390
},
{
"epoch": 0.1955,
"grad_norm": 0.7517937421798706,
"learning_rate": 3.46330241982385e-05,
"loss": 1.6021,
"num_input_tokens_seen": 3279945728,
"step": 391
},
{
"epoch": 0.196,
"grad_norm": 0.676200807094574,
"learning_rate": 3.457937254842823e-05,
"loss": 1.738,
"num_input_tokens_seen": 3288334336,
"step": 392
},
{
"epoch": 0.1965,
"grad_norm": 0.9149476289749146,
"learning_rate": 3.4525496066657735e-05,
"loss": 1.7369,
"num_input_tokens_seen": 3296722944,
"step": 393
},
{
"epoch": 0.197,
"grad_norm": 0.8072534203529358,
"learning_rate": 3.4471395583768985e-05,
"loss": 1.6982,
"num_input_tokens_seen": 3305111552,
"step": 394
},
{
"epoch": 0.1975,
"grad_norm": 0.7343615889549255,
"learning_rate": 3.441707193405838e-05,
"loss": 1.5764,
"num_input_tokens_seen": 3313500160,
"step": 395
},
{
"epoch": 0.198,
"grad_norm": 1.0707248449325562,
"learning_rate": 3.436252595526378e-05,
"loss": 1.5652,
"num_input_tokens_seen": 3321888768,
"step": 396
},
{
"epoch": 0.1985,
"grad_norm": 1.0053772926330566,
"learning_rate": 3.430775848855166e-05,
"loss": 1.6278,
"num_input_tokens_seen": 3330277376,
"step": 397
},
{
"epoch": 0.199,
"grad_norm": 1.151880145072937,
"learning_rate": 3.425277037850411e-05,
"loss": 1.7023,
"num_input_tokens_seen": 3338665984,
"step": 398
},
{
"epoch": 0.1995,
"grad_norm": 0.7218210697174072,
"learning_rate": 3.419756247310581e-05,
"loss": 1.5999,
"num_input_tokens_seen": 3347054592,
"step": 399
},
{
"epoch": 0.2,
"grad_norm": 1.1546841859817505,
"learning_rate": 3.4142135623730954e-05,
"loss": 1.6385,
"num_input_tokens_seen": 3355443200,
"step": 400
},
{
"epoch": 0.2005,
"grad_norm": 0.7345328330993652,
"learning_rate": 3.408649068513013e-05,
"loss": 1.7926,
"num_input_tokens_seen": 3363831808,
"step": 401
},
{
"epoch": 0.201,
"grad_norm": 0.7448058724403381,
"learning_rate": 3.403062851541712e-05,
"loss": 1.698,
"num_input_tokens_seen": 3372220416,
"step": 402
},
{
"epoch": 0.2015,
"grad_norm": 0.912766695022583,
"learning_rate": 3.397454997605569e-05,
"loss": 1.6496,
"num_input_tokens_seen": 3380609024,
"step": 403
},
{
"epoch": 0.202,
"grad_norm": 0.7980219721794128,
"learning_rate": 3.391825593184629e-05,
"loss": 1.6537,
"num_input_tokens_seen": 3388997632,
"step": 404
},
{
"epoch": 0.2025,
"grad_norm": 0.7381040453910828,
"learning_rate": 3.3861747250912724e-05,
"loss": 1.6003,
"num_input_tokens_seen": 3397386240,
"step": 405
},
{
"epoch": 0.203,
"grad_norm": 0.7264212369918823,
"learning_rate": 3.3805024804688745e-05,
"loss": 1.4873,
"num_input_tokens_seen": 3405774848,
"step": 406
},
{
"epoch": 0.2035,
"grad_norm": 0.7035699486732483,
"learning_rate": 3.374808946790466e-05,
"loss": 1.7535,
"num_input_tokens_seen": 3414163456,
"step": 407
},
{
"epoch": 0.204,
"grad_norm": 0.7741231918334961,
"learning_rate": 3.369094211857378e-05,
"loss": 1.6001,
"num_input_tokens_seen": 3422552064,
"step": 408
},
{
"epoch": 0.2045,
"grad_norm": 0.9337701201438904,
"learning_rate": 3.363358363797893e-05,
"loss": 1.7443,
"num_input_tokens_seen": 3430940672,
"step": 409
},
{
"epoch": 0.205,
"grad_norm": 0.7415249347686768,
"learning_rate": 3.357601491065884e-05,
"loss": 1.5493,
"num_input_tokens_seen": 3439329280,
"step": 410
},
{
"epoch": 0.2055,
"grad_norm": 0.9262260794639587,
"learning_rate": 3.35182368243945e-05,
"loss": 1.5924,
"num_input_tokens_seen": 3447717888,
"step": 411
},
{
"epoch": 0.206,
"grad_norm": 0.9444292783737183,
"learning_rate": 3.346025027019547e-05,
"loss": 1.5166,
"num_input_tokens_seen": 3456106496,
"step": 412
},
{
"epoch": 0.2065,
"grad_norm": 0.9404441118240356,
"learning_rate": 3.3402056142286156e-05,
"loss": 1.4745,
"num_input_tokens_seen": 3464495104,
"step": 413
},
{
"epoch": 0.207,
"grad_norm": 0.9392333030700684,
"learning_rate": 3.3343655338091996e-05,
"loss": 1.7033,
"num_input_tokens_seen": 3472883712,
"step": 414
},
{
"epoch": 0.2075,
"grad_norm": 0.8478058576583862,
"learning_rate": 3.328504875822564e-05,
"loss": 1.5295,
"num_input_tokens_seen": 3481272320,
"step": 415
},
{
"epoch": 0.208,
"grad_norm": 1.0962209701538086,
"learning_rate": 3.322623730647304e-05,
"loss": 1.485,
"num_input_tokens_seen": 3489660928,
"step": 416
},
{
"epoch": 0.2085,
"grad_norm": 0.8134574890136719,
"learning_rate": 3.316722188977955e-05,
"loss": 1.6648,
"num_input_tokens_seen": 3498049536,
"step": 417
},
{
"epoch": 0.209,
"grad_norm": 0.7560186982154846,
"learning_rate": 3.310800341823588e-05,
"loss": 1.7324,
"num_input_tokens_seen": 3506438144,
"step": 418
},
{
"epoch": 0.2095,
"grad_norm": 0.8222687244415283,
"learning_rate": 3.3048582805064137e-05,
"loss": 1.7548,
"num_input_tokens_seen": 3514826752,
"step": 419
},
{
"epoch": 0.21,
"grad_norm": 0.6891204714775085,
"learning_rate": 3.298896096660367e-05,
"loss": 1.6091,
"num_input_tokens_seen": 3523215360,
"step": 420
},
{
"epoch": 0.2105,
"grad_norm": 0.8553364276885986,
"learning_rate": 3.2929138822297004e-05,
"loss": 1.6255,
"num_input_tokens_seen": 3531603968,
"step": 421
},
{
"epoch": 0.211,
"grad_norm": 0.6773521900177002,
"learning_rate": 3.286911729467558e-05,
"loss": 1.6468,
"num_input_tokens_seen": 3539992576,
"step": 422
},
{
"epoch": 0.2115,
"grad_norm": 0.6326610445976257,
"learning_rate": 3.280889730934562e-05,
"loss": 1.5726,
"num_input_tokens_seen": 3548381184,
"step": 423
},
{
"epoch": 0.212,
"grad_norm": 0.6619254946708679,
"learning_rate": 3.27484797949738e-05,
"loss": 1.7451,
"num_input_tokens_seen": 3556769792,
"step": 424
},
{
"epoch": 0.2125,
"grad_norm": 0.6987465620040894,
"learning_rate": 3.268786568327291e-05,
"loss": 1.5267,
"num_input_tokens_seen": 3565158400,
"step": 425
},
{
"epoch": 0.213,
"grad_norm": 1.052083969116211,
"learning_rate": 3.262705590898756e-05,
"loss": 1.5756,
"num_input_tokens_seen": 3573547008,
"step": 426
},
{
"epoch": 0.2135,
"grad_norm": 1.3585137128829956,
"learning_rate": 3.2566051409879676e-05,
"loss": 1.6222,
"num_input_tokens_seen": 3581935616,
"step": 427
},
{
"epoch": 0.214,
"grad_norm": 0.7190059423446655,
"learning_rate": 3.250485312671411e-05,
"loss": 1.5788,
"num_input_tokens_seen": 3590324224,
"step": 428
},
{
"epoch": 0.2145,
"grad_norm": 1.0504647493362427,
"learning_rate": 3.244346200324409e-05,
"loss": 1.7213,
"num_input_tokens_seen": 3598712832,
"step": 429
},
{
"epoch": 0.215,
"grad_norm": 1.4357564449310303,
"learning_rate": 3.238187898619669e-05,
"loss": 1.9118,
"num_input_tokens_seen": 3607101440,
"step": 430
},
{
"epoch": 0.2155,
"grad_norm": 0.7912888526916504,
"learning_rate": 3.23201050252582e-05,
"loss": 1.5837,
"num_input_tokens_seen": 3615490048,
"step": 431
},
{
"epoch": 0.216,
"grad_norm": 1.4020187854766846,
"learning_rate": 3.2258141073059533e-05,
"loss": 1.602,
"num_input_tokens_seen": 3623878656,
"step": 432
},
{
"epoch": 0.2165,
"grad_norm": 0.8430026173591614,
"learning_rate": 3.219598808516148e-05,
"loss": 1.5711,
"num_input_tokens_seen": 3632267264,
"step": 433
},
{
"epoch": 0.217,
"grad_norm": 1.3351163864135742,
"learning_rate": 3.2133647020039995e-05,
"loss": 1.64,
"num_input_tokens_seen": 3640655872,
"step": 434
},
{
"epoch": 0.2175,
"grad_norm": 0.8527347445487976,
"learning_rate": 3.207111883907143e-05,
"loss": 1.6443,
"num_input_tokens_seen": 3649044480,
"step": 435
},
{
"epoch": 0.218,
"grad_norm": 1.2249008417129517,
"learning_rate": 3.200840450651769e-05,
"loss": 1.7433,
"num_input_tokens_seen": 3657433088,
"step": 436
},
{
"epoch": 0.2185,
"grad_norm": 0.8667119741439819,
"learning_rate": 3.194550498951134e-05,
"loss": 1.6459,
"num_input_tokens_seen": 3665821696,
"step": 437
},
{
"epoch": 0.219,
"grad_norm": 1.0201963186264038,
"learning_rate": 3.188242125804078e-05,
"loss": 1.5644,
"num_input_tokens_seen": 3674210304,
"step": 438
},
{
"epoch": 0.2195,
"grad_norm": 0.6974561214447021,
"learning_rate": 3.181915428493515e-05,
"loss": 1.6249,
"num_input_tokens_seen": 3682598912,
"step": 439
},
{
"epoch": 0.22,
"grad_norm": 0.920744001865387,
"learning_rate": 3.1755705045849465e-05,
"loss": 1.5955,
"num_input_tokens_seen": 3690987520,
"step": 440
},
{
"epoch": 0.2205,
"grad_norm": 0.7391747832298279,
"learning_rate": 3.1692074519249476e-05,
"loss": 1.5939,
"num_input_tokens_seen": 3699376128,
"step": 441
},
{
"epoch": 0.221,
"grad_norm": 0.7469002604484558,
"learning_rate": 3.1628263686396614e-05,
"loss": 1.9061,
"num_input_tokens_seen": 3707764736,
"step": 442
},
{
"epoch": 0.2215,
"grad_norm": 0.8305338025093079,
"learning_rate": 3.156427353133286e-05,
"loss": 1.7827,
"num_input_tokens_seen": 3716153344,
"step": 443
},
{
"epoch": 0.222,
"grad_norm": 0.666205644607544,
"learning_rate": 3.150010504086558e-05,
"loss": 1.5269,
"num_input_tokens_seen": 3724541952,
"step": 444
},
{
"epoch": 0.2225,
"grad_norm": 0.6806762218475342,
"learning_rate": 3.1435759204552246e-05,
"loss": 1.7997,
"num_input_tokens_seen": 3732930560,
"step": 445
},
{
"epoch": 0.223,
"grad_norm": 0.6571435332298279,
"learning_rate": 3.1371237014685285e-05,
"loss": 1.6215,
"num_input_tokens_seen": 3741319168,
"step": 446
},
{
"epoch": 0.2235,
"grad_norm": 0.685133159160614,
"learning_rate": 3.130653946627666e-05,
"loss": 1.7139,
"num_input_tokens_seen": 3749707776,
"step": 447
},
{
"epoch": 0.224,
"grad_norm": 0.6187507510185242,
"learning_rate": 3.124166755704261e-05,
"loss": 1.6145,
"num_input_tokens_seen": 3758096384,
"step": 448
},
{
"epoch": 0.2245,
"grad_norm": 0.5842682719230652,
"learning_rate": 3.117662228738823e-05,
"loss": 1.6303,
"num_input_tokens_seen": 3766484992,
"step": 449
},
{
"epoch": 0.225,
"grad_norm": 0.626196026802063,
"learning_rate": 3.111140466039205e-05,
"loss": 1.4281,
"num_input_tokens_seen": 3774873600,
"step": 450
},
{
"epoch": 0.2255,
"grad_norm": 0.8410570025444031,
"learning_rate": 3.104601568179054e-05,
"loss": 1.5349,
"num_input_tokens_seen": 3783262208,
"step": 451
},
{
"epoch": 0.226,
"grad_norm": 0.9013755321502686,
"learning_rate": 3.098045635996264e-05,
"loss": 1.6157,
"num_input_tokens_seen": 3791650816,
"step": 452
},
{
"epoch": 0.2265,
"grad_norm": 0.8393816351890564,
"learning_rate": 3.09147277059142e-05,
"loss": 1.5574,
"num_input_tokens_seen": 3800039424,
"step": 453
},
{
"epoch": 0.227,
"grad_norm": 0.8969332575798035,
"learning_rate": 3.084883073326238e-05,
"loss": 1.8073,
"num_input_tokens_seen": 3808428032,
"step": 454
},
{
"epoch": 0.2275,
"grad_norm": 0.9891500473022461,
"learning_rate": 3.078276645822001e-05,
"loss": 1.6999,
"num_input_tokens_seen": 3816816640,
"step": 455
},
{
"epoch": 0.228,
"grad_norm": 0.6412311792373657,
"learning_rate": 3.0716535899579936e-05,
"loss": 1.7065,
"num_input_tokens_seen": 3825205248,
"step": 456
},
{
"epoch": 0.2285,
"grad_norm": 1.021301031112671,
"learning_rate": 3.065014007869931e-05,
"loss": 1.6141,
"num_input_tokens_seen": 3833593856,
"step": 457
},
{
"epoch": 0.229,
"grad_norm": 0.933661162853241,
"learning_rate": 3.058358001948381e-05,
"loss": 1.5318,
"num_input_tokens_seen": 3841982464,
"step": 458
},
{
"epoch": 0.2295,
"grad_norm": 0.7375662922859192,
"learning_rate": 3.0516856748371914e-05,
"loss": 1.6712,
"num_input_tokens_seen": 3850371072,
"step": 459
},
{
"epoch": 0.23,
"grad_norm": 0.7946784496307373,
"learning_rate": 3.0449971294318977e-05,
"loss": 1.6786,
"num_input_tokens_seen": 3858759680,
"step": 460
},
{
"epoch": 0.2305,
"grad_norm": 0.8587263822555542,
"learning_rate": 3.0382924688781462e-05,
"loss": 1.6399,
"num_input_tokens_seen": 3867148288,
"step": 461
},
{
"epoch": 0.231,
"grad_norm": 0.836588442325592,
"learning_rate": 3.031571796570095e-05,
"loss": 1.5581,
"num_input_tokens_seen": 3875536896,
"step": 462
},
{
"epoch": 0.2315,
"grad_norm": 0.70086270570755,
"learning_rate": 3.0248352161488267e-05,
"loss": 1.4948,
"num_input_tokens_seen": 3883925504,
"step": 463
},
{
"epoch": 0.232,
"grad_norm": 0.7697365283966064,
"learning_rate": 3.018082831500743e-05,
"loss": 1.6849,
"num_input_tokens_seen": 3892314112,
"step": 464
},
{
"epoch": 0.2325,
"grad_norm": 0.7808486819267273,
"learning_rate": 3.0113147467559697e-05,
"loss": 1.5426,
"num_input_tokens_seen": 3900702720,
"step": 465
},
{
"epoch": 0.233,
"grad_norm": 0.6648643016815186,
"learning_rate": 3.004531066286745e-05,
"loss": 1.6239,
"num_input_tokens_seen": 3909091328,
"step": 466
},
{
"epoch": 0.2335,
"grad_norm": 0.6827794909477234,
"learning_rate": 2.997731894705815e-05,
"loss": 1.5759,
"num_input_tokens_seen": 3917479936,
"step": 467
},
{
"epoch": 0.234,
"grad_norm": 0.7341679334640503,
"learning_rate": 2.9909173368648154e-05,
"loss": 1.7485,
"num_input_tokens_seen": 3925868544,
"step": 468
},
{
"epoch": 0.2345,
"grad_norm": 0.6696147322654724,
"learning_rate": 2.9840874978526582e-05,
"loss": 1.5589,
"num_input_tokens_seen": 3934257152,
"step": 469
},
{
"epoch": 0.235,
"grad_norm": 0.6791352033615112,
"learning_rate": 2.9772424829939103e-05,
"loss": 1.5359,
"num_input_tokens_seen": 3942645760,
"step": 470
},
{
"epoch": 0.2355,
"grad_norm": 0.5946401357650757,
"learning_rate": 2.9703823978471676e-05,
"loss": 1.6936,
"num_input_tokens_seen": 3951034368,
"step": 471
},
{
"epoch": 0.236,
"grad_norm": 0.5652855038642883,
"learning_rate": 2.9635073482034307e-05,
"loss": 1.5851,
"num_input_tokens_seen": 3959422976,
"step": 472
},
{
"epoch": 0.2365,
"grad_norm": 0.6535623669624329,
"learning_rate": 2.9566174400844692e-05,
"loss": 1.6653,
"num_input_tokens_seen": 3967811584,
"step": 473
},
{
"epoch": 0.237,
"grad_norm": 0.6742349863052368,
"learning_rate": 2.949712779741189e-05,
"loss": 1.5506,
"num_input_tokens_seen": 3976200192,
"step": 474
},
{
"epoch": 0.2375,
"grad_norm": 0.8731011748313904,
"learning_rate": 2.9427934736519962e-05,
"loss": 1.6377,
"num_input_tokens_seen": 3984588800,
"step": 475
},
{
"epoch": 0.238,
"grad_norm": 0.834618866443634,
"learning_rate": 2.935859628521147e-05,
"loss": 1.6262,
"num_input_tokens_seen": 3992977408,
"step": 476
},
{
"epoch": 0.2385,
"grad_norm": 0.7309226393699646,
"learning_rate": 2.9289113512771133e-05,
"loss": 1.6611,
"num_input_tokens_seen": 4001366016,
"step": 477
},
{
"epoch": 0.239,
"grad_norm": 0.5517134070396423,
"learning_rate": 2.921948749070925e-05,
"loss": 1.6789,
"num_input_tokens_seen": 4009754624,
"step": 478
},
{
"epoch": 0.2395,
"grad_norm": 0.6876691579818726,
"learning_rate": 2.914971929274521e-05,
"loss": 1.5779,
"num_input_tokens_seen": 4018143232,
"step": 479
},
{
"epoch": 0.24,
"grad_norm": 0.7181439399719238,
"learning_rate": 2.9079809994790937e-05,
"loss": 1.7478,
"num_input_tokens_seen": 4026531840,
"step": 480
},
{
"epoch": 0.2405,
"grad_norm": 0.546655535697937,
"learning_rate": 2.900976067493429e-05,
"loss": 1.6137,
"num_input_tokens_seen": 4034920448,
"step": 481
},
{
"epoch": 0.241,
"grad_norm": 0.8767627477645874,
"learning_rate": 2.8939572413422426e-05,
"loss": 1.7662,
"num_input_tokens_seen": 4043309056,
"step": 482
},
{
"epoch": 0.2415,
"grad_norm": 1.0640788078308105,
"learning_rate": 2.886924629264517e-05,
"loss": 1.6892,
"num_input_tokens_seen": 4051697664,
"step": 483
},
{
"epoch": 0.242,
"grad_norm": 0.7425519824028015,
"learning_rate": 2.8798783397118305e-05,
"loss": 1.6408,
"num_input_tokens_seen": 4060086272,
"step": 484
},
{
"epoch": 0.2425,
"grad_norm": 0.5955070853233337,
"learning_rate": 2.872818481346684e-05,
"loss": 1.6736,
"num_input_tokens_seen": 4068474880,
"step": 485
},
{
"epoch": 0.243,
"grad_norm": 0.7473011612892151,
"learning_rate": 2.8657451630408287e-05,
"loss": 1.6427,
"num_input_tokens_seen": 4076863488,
"step": 486
},
{
"epoch": 0.2435,
"grad_norm": 0.7602230310440063,
"learning_rate": 2.85865849387358e-05,
"loss": 1.7862,
"num_input_tokens_seen": 4085252096,
"step": 487
},
{
"epoch": 0.244,
"grad_norm": 0.5774588584899902,
"learning_rate": 2.8515585831301456e-05,
"loss": 1.4897,
"num_input_tokens_seen": 4093640704,
"step": 488
},
{
"epoch": 0.2445,
"grad_norm": 1.0716197490692139,
"learning_rate": 2.844445540299931e-05,
"loss": 1.7469,
"num_input_tokens_seen": 4102029312,
"step": 489
},
{
"epoch": 0.245,
"grad_norm": 0.9976931810379028,
"learning_rate": 2.8373194750748566e-05,
"loss": 1.7009,
"num_input_tokens_seen": 4110417920,
"step": 490
},
{
"epoch": 0.2455,
"grad_norm": 0.6268777251243591,
"learning_rate": 2.8301804973476628e-05,
"loss": 1.5147,
"num_input_tokens_seen": 4118806528,
"step": 491
},
{
"epoch": 0.246,
"grad_norm": 0.8679307103157043,
"learning_rate": 2.823028717210218e-05,
"loss": 1.6155,
"num_input_tokens_seen": 4127195136,
"step": 492
},
{
"epoch": 0.2465,
"grad_norm": 0.7973881959915161,
"learning_rate": 2.8158642449518186e-05,
"loss": 1.6257,
"num_input_tokens_seen": 4135583744,
"step": 493
},
{
"epoch": 0.247,
"grad_norm": 0.5766689777374268,
"learning_rate": 2.8086871910574904e-05,
"loss": 1.6149,
"num_input_tokens_seen": 4143972352,
"step": 494
},
{
"epoch": 0.2475,
"grad_norm": 0.6986018419265747,
"learning_rate": 2.8014976662062818e-05,
"loss": 1.571,
"num_input_tokens_seen": 4152360960,
"step": 495
},
{
"epoch": 0.248,
"grad_norm": 0.9588996171951294,
"learning_rate": 2.7942957812695613e-05,
"loss": 1.6737,
"num_input_tokens_seen": 4160749568,
"step": 496
},
{
"epoch": 0.2485,
"grad_norm": 0.5572625398635864,
"learning_rate": 2.787081647309303e-05,
"loss": 1.6634,
"num_input_tokens_seen": 4169138176,
"step": 497
},
{
"epoch": 0.249,
"grad_norm": 0.7389435172080994,
"learning_rate": 2.7798553755763768e-05,
"loss": 1.4613,
"num_input_tokens_seen": 4177526784,
"step": 498
},
{
"epoch": 0.2495,
"grad_norm": 0.7594336867332458,
"learning_rate": 2.7726170775088324e-05,
"loss": 1.7553,
"num_input_tokens_seen": 4185915392,
"step": 499
},
{
"epoch": 0.25,
"grad_norm": 0.5780491232872009,
"learning_rate": 2.7653668647301797e-05,
"loss": 1.674,
"num_input_tokens_seen": 4194304000,
"step": 500
},
{
"epoch": 0.2505,
"grad_norm": 0.7985231876373291,
"learning_rate": 2.7581048490476695e-05,
"loss": 1.6012,
"num_input_tokens_seen": 4202692608,
"step": 501
},
{
"epoch": 0.251,
"grad_norm": 0.6286901831626892,
"learning_rate": 2.7508311424505665e-05,
"loss": 1.8672,
"num_input_tokens_seen": 4211081216,
"step": 502
},
{
"epoch": 0.2515,
"grad_norm": 0.7128638029098511,
"learning_rate": 2.7435458571084247e-05,
"loss": 1.5799,
"num_input_tokens_seen": 4219469824,
"step": 503
},
{
"epoch": 0.252,
"grad_norm": 0.6798509359359741,
"learning_rate": 2.7362491053693564e-05,
"loss": 1.6685,
"num_input_tokens_seen": 4227858432,
"step": 504
},
{
"epoch": 0.2525,
"grad_norm": 0.8275137543678284,
"learning_rate": 2.7289409997583002e-05,
"loss": 1.6989,
"num_input_tokens_seen": 4236247040,
"step": 505
},
{
"epoch": 0.253,
"grad_norm": 0.5991443991661072,
"learning_rate": 2.7216216529752836e-05,
"loss": 1.509,
"num_input_tokens_seen": 4244635648,
"step": 506
},
{
"epoch": 0.2535,
"grad_norm": 0.7469322681427002,
"learning_rate": 2.7142911778936913e-05,
"loss": 1.4821,
"num_input_tokens_seen": 4253024256,
"step": 507
},
{
"epoch": 0.254,
"grad_norm": 0.7251424789428711,
"learning_rate": 2.7069496875585145e-05,
"loss": 1.6557,
"num_input_tokens_seen": 4261412864,
"step": 508
},
{
"epoch": 0.2545,
"grad_norm": 0.5844877362251282,
"learning_rate": 2.6995972951846177e-05,
"loss": 1.6354,
"num_input_tokens_seen": 4269801472,
"step": 509
},
{
"epoch": 0.255,
"grad_norm": 0.6762341260910034,
"learning_rate": 2.692234114154986e-05,
"loss": 1.6662,
"num_input_tokens_seen": 4278190080,
"step": 510
},
{
"epoch": 0.2555,
"grad_norm": 0.6216310262680054,
"learning_rate": 2.68486025801898e-05,
"loss": 1.6086,
"num_input_tokens_seen": 4286578688,
"step": 511
},
{
"epoch": 0.256,
"grad_norm": 0.6491274833679199,
"learning_rate": 2.6774758404905833e-05,
"loss": 1.6203,
"num_input_tokens_seen": 4294967296,
"step": 512
},
{
"epoch": 0.2565,
"grad_norm": 0.5345763564109802,
"learning_rate": 2.670080975446648e-05,
"loss": 1.7299,
"num_input_tokens_seen": 4303355904,
"step": 513
},
{
"epoch": 0.257,
"grad_norm": 0.60438072681427,
"learning_rate": 2.662675776925142e-05,
"loss": 1.5607,
"num_input_tokens_seen": 4311744512,
"step": 514
},
{
"epoch": 0.2575,
"grad_norm": 0.48150795698165894,
"learning_rate": 2.6552603591233875e-05,
"loss": 1.5468,
"num_input_tokens_seen": 4320133120,
"step": 515
},
{
"epoch": 0.258,
"grad_norm": 0.7575556635856628,
"learning_rate": 2.647834836396299e-05,
"loss": 1.5371,
"num_input_tokens_seen": 4328521728,
"step": 516
},
{
"epoch": 0.2585,
"grad_norm": 0.8684462308883667,
"learning_rate": 2.6403993232546235e-05,
"loss": 1.3988,
"num_input_tokens_seen": 4336910336,
"step": 517
},
{
"epoch": 0.259,
"grad_norm": 1.1402355432510376,
"learning_rate": 2.6329539343631725e-05,
"loss": 1.5219,
"num_input_tokens_seen": 4345298944,
"step": 518
},
{
"epoch": 0.2595,
"grad_norm": 0.7717565894126892,
"learning_rate": 2.625498784539052e-05,
"loss": 1.6181,
"num_input_tokens_seen": 4353687552,
"step": 519
},
{
"epoch": 0.26,
"grad_norm": 0.6470482349395752,
"learning_rate": 2.618033988749895e-05,
"loss": 1.8013,
"num_input_tokens_seen": 4362076160,
"step": 520
},
{
"epoch": 0.2605,
"grad_norm": 0.9011819958686829,
"learning_rate": 2.6105596621120873e-05,
"loss": 1.5728,
"num_input_tokens_seen": 4370464768,
"step": 521
},
{
"epoch": 0.261,
"grad_norm": 0.9903879761695862,
"learning_rate": 2.6030759198889915e-05,
"loss": 1.5021,
"num_input_tokens_seen": 4378853376,
"step": 522
},
{
"epoch": 0.2615,
"grad_norm": 0.6056469678878784,
"learning_rate": 2.595582877489171e-05,
"loss": 1.5722,
"num_input_tokens_seen": 4387241984,
"step": 523
},
{
"epoch": 0.262,
"grad_norm": 0.7082294225692749,
"learning_rate": 2.588080650464608e-05,
"loss": 1.5256,
"num_input_tokens_seen": 4395630592,
"step": 524
},
{
"epoch": 0.2625,
"grad_norm": 0.8824757933616638,
"learning_rate": 2.580569354508925e-05,
"loss": 1.6759,
"num_input_tokens_seen": 4404019200,
"step": 525
},
{
"epoch": 0.263,
"grad_norm": 0.6927574276924133,
"learning_rate": 2.573049105455597e-05,
"loss": 1.5,
"num_input_tokens_seen": 4412407808,
"step": 526
},
{
"epoch": 0.2635,
"grad_norm": 0.8164727687835693,
"learning_rate": 2.5655200192761668e-05,
"loss": 1.4772,
"num_input_tokens_seen": 4420796416,
"step": 527
},
{
"epoch": 0.264,
"grad_norm": 0.6587943434715271,
"learning_rate": 2.557982212078459e-05,
"loss": 1.6043,
"num_input_tokens_seen": 4429185024,
"step": 528
},
{
"epoch": 0.2645,
"grad_norm": 0.6168190836906433,
"learning_rate": 2.550435800104783e-05,
"loss": 1.5051,
"num_input_tokens_seen": 4437573632,
"step": 529
},
{
"epoch": 0.265,
"grad_norm": 0.6624361872673035,
"learning_rate": 2.5428808997301486e-05,
"loss": 1.6477,
"num_input_tokens_seen": 4445962240,
"step": 530
},
{
"epoch": 0.2655,
"grad_norm": 0.6016016006469727,
"learning_rate": 2.535317627460465e-05,
"loss": 1.7103,
"num_input_tokens_seen": 4454350848,
"step": 531
},
{
"epoch": 0.266,
"grad_norm": 0.7597566246986389,
"learning_rate": 2.5277460999307462e-05,
"loss": 1.5454,
"num_input_tokens_seen": 4462739456,
"step": 532
},
{
"epoch": 0.2665,
"grad_norm": 0.5779781937599182,
"learning_rate": 2.5201664339033138e-05,
"loss": 1.74,
"num_input_tokens_seen": 4471128064,
"step": 533
},
{
"epoch": 0.267,
"grad_norm": 0.7097752094268799,
"learning_rate": 2.5125787462659937e-05,
"loss": 1.6777,
"num_input_tokens_seen": 4479516672,
"step": 534
},
{
"epoch": 0.2675,
"grad_norm": 0.6358765959739685,
"learning_rate": 2.504983154030316e-05,
"loss": 1.69,
"num_input_tokens_seen": 4487905280,
"step": 535
},
{
"epoch": 0.268,
"grad_norm": 0.8313521146774292,
"learning_rate": 2.4973797743297103e-05,
"loss": 1.427,
"num_input_tokens_seen": 4496293888,
"step": 536
},
{
"epoch": 0.2685,
"grad_norm": 1.0910331010818481,
"learning_rate": 2.489768724417695e-05,
"loss": 1.7318,
"num_input_tokens_seen": 4504682496,
"step": 537
},
{
"epoch": 0.269,
"grad_norm": 0.7883654236793518,
"learning_rate": 2.4821501216660778e-05,
"loss": 1.7768,
"num_input_tokens_seen": 4513071104,
"step": 538
},
{
"epoch": 0.2695,
"grad_norm": 0.814295768737793,
"learning_rate": 2.474524083563136e-05,
"loss": 1.6363,
"num_input_tokens_seen": 4521459712,
"step": 539
},
{
"epoch": 0.27,
"grad_norm": 0.8142704367637634,
"learning_rate": 2.4668907277118114e-05,
"loss": 1.8271,
"num_input_tokens_seen": 4529848320,
"step": 540
},
{
"epoch": 0.2705,
"grad_norm": 0.6927081942558289,
"learning_rate": 2.459250171827894e-05,
"loss": 1.5607,
"num_input_tokens_seen": 4538236928,
"step": 541
},
{
"epoch": 0.271,
"grad_norm": 0.7037166357040405,
"learning_rate": 2.4516025337382078e-05,
"loss": 1.606,
"num_input_tokens_seen": 4546625536,
"step": 542
},
{
"epoch": 0.2715,
"grad_norm": 0.7414228916168213,
"learning_rate": 2.443947931378792e-05,
"loss": 1.6102,
"num_input_tokens_seen": 4555014144,
"step": 543
},
{
"epoch": 0.272,
"grad_norm": 0.734017014503479,
"learning_rate": 2.4362864827930855e-05,
"loss": 1.6308,
"num_input_tokens_seen": 4563402752,
"step": 544
},
{
"epoch": 0.2725,
"grad_norm": 0.8329829573631287,
"learning_rate": 2.4286183061301016e-05,
"loss": 1.6925,
"num_input_tokens_seen": 4571791360,
"step": 545
},
{
"epoch": 0.273,
"grad_norm": 0.598017692565918,
"learning_rate": 2.4209435196426112e-05,
"loss": 1.4737,
"num_input_tokens_seen": 4580179968,
"step": 546
},
{
"epoch": 0.2735,
"grad_norm": 0.6858911514282227,
"learning_rate": 2.4132622416853164e-05,
"loss": 1.6603,
"num_input_tokens_seen": 4588568576,
"step": 547
},
{
"epoch": 0.274,
"grad_norm": 0.6639436483383179,
"learning_rate": 2.405574590713025e-05,
"loss": 1.5032,
"num_input_tokens_seen": 4596957184,
"step": 548
},
{
"epoch": 0.2745,
"grad_norm": 0.6341187357902527,
"learning_rate": 2.3978806852788253e-05,
"loss": 1.7347,
"num_input_tokens_seen": 4605345792,
"step": 549
},
{
"epoch": 0.275,
"grad_norm": 0.5443724393844604,
"learning_rate": 2.390180644032257e-05,
"loss": 1.6864,
"num_input_tokens_seen": 4613734400,
"step": 550
},
{
"epoch": 0.2755,
"grad_norm": 0.6698442101478577,
"learning_rate": 2.382474585717481e-05,
"loss": 1.7269,
"num_input_tokens_seen": 4622123008,
"step": 551
},
{
"epoch": 0.276,
"grad_norm": 0.508804976940155,
"learning_rate": 2.37476262917145e-05,
"loss": 1.6529,
"num_input_tokens_seen": 4630511616,
"step": 552
},
{
"epoch": 0.2765,
"grad_norm": 0.6291599869728088,
"learning_rate": 2.3670448933220732e-05,
"loss": 1.6619,
"num_input_tokens_seen": 4638900224,
"step": 553
},
{
"epoch": 0.277,
"grad_norm": 0.591014564037323,
"learning_rate": 2.3593214971863857e-05,
"loss": 1.6256,
"num_input_tokens_seen": 4647288832,
"step": 554
},
{
"epoch": 0.2775,
"grad_norm": 0.5783071517944336,
"learning_rate": 2.3515925598687097e-05,
"loss": 1.74,
"num_input_tokens_seen": 4655677440,
"step": 555
},
{
"epoch": 0.278,
"grad_norm": 0.539421558380127,
"learning_rate": 2.3438582005588192e-05,
"loss": 1.6832,
"num_input_tokens_seen": 4664066048,
"step": 556
},
{
"epoch": 0.2785,
"grad_norm": 0.5550200343132019,
"learning_rate": 2.3361185385301042e-05,
"loss": 1.6972,
"num_input_tokens_seen": 4672454656,
"step": 557
},
{
"epoch": 0.279,
"grad_norm": 0.6679721474647522,
"learning_rate": 2.328373693137726e-05,
"loss": 1.6456,
"num_input_tokens_seen": 4680843264,
"step": 558
},
{
"epoch": 0.2795,
"grad_norm": 0.49843743443489075,
"learning_rate": 2.3206237838167825e-05,
"loss": 1.6452,
"num_input_tokens_seen": 4689231872,
"step": 559
},
{
"epoch": 0.28,
"grad_norm": 0.6135469079017639,
"learning_rate": 2.312868930080462e-05,
"loss": 1.6034,
"num_input_tokens_seen": 4697620480,
"step": 560
},
{
"epoch": 0.2805,
"grad_norm": 0.5259789824485779,
"learning_rate": 2.3051092515182022e-05,
"loss": 1.5512,
"num_input_tokens_seen": 4706009088,
"step": 561
},
{
"epoch": 0.281,
"grad_norm": 0.6396327614784241,
"learning_rate": 2.2973448677938466e-05,
"loss": 1.6086,
"num_input_tokens_seen": 4714397696,
"step": 562
},
{
"epoch": 0.2815,
"grad_norm": 0.523544430732727,
"learning_rate": 2.289575898643796e-05,
"loss": 1.5549,
"num_input_tokens_seen": 4722786304,
"step": 563
},
{
"epoch": 0.282,
"grad_norm": 0.5481888651847839,
"learning_rate": 2.2818024638751655e-05,
"loss": 1.6328,
"num_input_tokens_seen": 4731174912,
"step": 564
},
{
"epoch": 0.2825,
"grad_norm": 0.5783687233924866,
"learning_rate": 2.2740246833639366e-05,
"loss": 1.7459,
"num_input_tokens_seen": 4739563520,
"step": 565
},
{
"epoch": 0.283,
"grad_norm": 0.5406476259231567,
"learning_rate": 2.266242677053105e-05,
"loss": 1.481,
"num_input_tokens_seen": 4747952128,
"step": 566
},
{
"epoch": 0.2835,
"grad_norm": 0.7908449172973633,
"learning_rate": 2.2584565649508355e-05,
"loss": 1.6422,
"num_input_tokens_seen": 4756340736,
"step": 567
},
{
"epoch": 0.284,
"grad_norm": 0.7007967829704285,
"learning_rate": 2.2506664671286087e-05,
"loss": 1.6323,
"num_input_tokens_seen": 4764729344,
"step": 568
},
{
"epoch": 0.2845,
"grad_norm": 0.5286893248558044,
"learning_rate": 2.2428725037193697e-05,
"loss": 1.6692,
"num_input_tokens_seen": 4773117952,
"step": 569
},
{
"epoch": 0.285,
"grad_norm": 0.7265971899032593,
"learning_rate": 2.2350747949156756e-05,
"loss": 1.6742,
"num_input_tokens_seen": 4781506560,
"step": 570
},
{
"epoch": 0.2855,
"grad_norm": 0.592954158782959,
"learning_rate": 2.2272734609678426e-05,
"loss": 1.5685,
"num_input_tokens_seen": 4789895168,
"step": 571
},
{
"epoch": 0.286,
"grad_norm": 0.532684862613678,
"learning_rate": 2.2194686221820905e-05,
"loss": 1.6014,
"num_input_tokens_seen": 4798283776,
"step": 572
},
{
"epoch": 0.2865,
"grad_norm": 0.5546914339065552,
"learning_rate": 2.2116603989186895e-05,
"loss": 1.4608,
"num_input_tokens_seen": 4806672384,
"step": 573
},
{
"epoch": 0.287,
"grad_norm": 0.654141366481781,
"learning_rate": 2.2038489115901e-05,
"loss": 1.4977,
"num_input_tokens_seen": 4815060992,
"step": 574
},
{
"epoch": 0.2875,
"grad_norm": 0.48651596903800964,
"learning_rate": 2.196034280659122e-05,
"loss": 1.591,
"num_input_tokens_seen": 4823449600,
"step": 575
},
{
"epoch": 0.288,
"grad_norm": 0.6360925436019897,
"learning_rate": 2.1882166266370292e-05,
"loss": 1.5767,
"num_input_tokens_seen": 4831838208,
"step": 576
},
{
"epoch": 0.2885,
"grad_norm": 0.48844876885414124,
"learning_rate": 2.1803960700817185e-05,
"loss": 1.8562,
"num_input_tokens_seen": 4840226816,
"step": 577
},
{
"epoch": 0.289,
"grad_norm": 0.6248428821563721,
"learning_rate": 2.1725727315958473e-05,
"loss": 1.5894,
"num_input_tokens_seen": 4848615424,
"step": 578
},
{
"epoch": 0.2895,
"grad_norm": 0.7215993404388428,
"learning_rate": 2.1647467318249715e-05,
"loss": 1.6851,
"num_input_tokens_seen": 4857004032,
"step": 579
},
{
"epoch": 0.29,
"grad_norm": 0.6206884980201721,
"learning_rate": 2.1569181914556904e-05,
"loss": 1.5265,
"num_input_tokens_seen": 4865392640,
"step": 580
},
{
"epoch": 0.2905,
"grad_norm": 0.5436145067214966,
"learning_rate": 2.1490872312137795e-05,
"loss": 1.7174,
"num_input_tokens_seen": 4873781248,
"step": 581
},
{
"epoch": 0.291,
"grad_norm": 0.6190699338912964,
"learning_rate": 2.1412539718623337e-05,
"loss": 1.3116,
"num_input_tokens_seen": 4882169856,
"step": 582
},
{
"epoch": 0.2915,
"grad_norm": 0.6792662143707275,
"learning_rate": 2.1334185341999024e-05,
"loss": 1.5902,
"num_input_tokens_seen": 4890558464,
"step": 583
},
{
"epoch": 0.292,
"grad_norm": 0.5043581128120422,
"learning_rate": 2.125581039058627e-05,
"loss": 1.6654,
"num_input_tokens_seen": 4898947072,
"step": 584
},
{
"epoch": 0.2925,
"grad_norm": 0.5902267694473267,
"learning_rate": 2.117741607302378e-05,
"loss": 1.6671,
"num_input_tokens_seen": 4907335680,
"step": 585
},
{
"epoch": 0.293,
"grad_norm": 0.4893746078014374,
"learning_rate": 2.109900359824892e-05,
"loss": 1.6003,
"num_input_tokens_seen": 4915724288,
"step": 586
},
{
"epoch": 0.2935,
"grad_norm": 0.6055283546447754,
"learning_rate": 2.1020574175479035e-05,
"loss": 1.872,
"num_input_tokens_seen": 4924112896,
"step": 587
},
{
"epoch": 0.294,
"grad_norm": 0.5110766291618347,
"learning_rate": 2.0942129014192854e-05,
"loss": 1.5215,
"num_input_tokens_seen": 4932501504,
"step": 588
},
{
"epoch": 0.2945,
"grad_norm": 0.6477858424186707,
"learning_rate": 2.0863669324111807e-05,
"loss": 1.6343,
"num_input_tokens_seen": 4940890112,
"step": 589
},
{
"epoch": 0.295,
"grad_norm": 0.5806403160095215,
"learning_rate": 2.0785196315181374e-05,
"loss": 1.4936,
"num_input_tokens_seen": 4949278720,
"step": 590
},
{
"epoch": 0.2955,
"grad_norm": 0.638688862323761,
"learning_rate": 2.0706711197552427e-05,
"loss": 1.4357,
"num_input_tokens_seen": 4957667328,
"step": 591
},
{
"epoch": 0.296,
"grad_norm": 0.5387598872184753,
"learning_rate": 2.0628215181562567e-05,
"loss": 1.8016,
"num_input_tokens_seen": 4966055936,
"step": 592
},
{
"epoch": 0.2965,
"grad_norm": 1.0725306272506714,
"learning_rate": 2.054970947771747e-05,
"loss": 1.8565,
"num_input_tokens_seen": 4974444544,
"step": 593
},
{
"epoch": 0.297,
"grad_norm": 0.649124801158905,
"learning_rate": 2.0471195296672207e-05,
"loss": 1.5579,
"num_input_tokens_seen": 4982833152,
"step": 594
},
{
"epoch": 0.2975,
"grad_norm": 0.7250737547874451,
"learning_rate": 2.0392673849212565e-05,
"loss": 1.5504,
"num_input_tokens_seen": 4991221760,
"step": 595
},
{
"epoch": 0.298,
"grad_norm": 0.578586757183075,
"learning_rate": 2.0314146346236415e-05,
"loss": 1.5464,
"num_input_tokens_seen": 4999610368,
"step": 596
},
{
"epoch": 0.2985,
"grad_norm": 0.6575354933738708,
"learning_rate": 2.0235613998734985e-05,
"loss": 1.818,
"num_input_tokens_seen": 5007998976,
"step": 597
},
{
"epoch": 0.299,
"grad_norm": 0.4576837718486786,
"learning_rate": 2.0157078017774228e-05,
"loss": 1.7311,
"num_input_tokens_seen": 5016387584,
"step": 598
},
{
"epoch": 0.2995,
"grad_norm": 0.6546321511268616,
"learning_rate": 2.0078539614476122e-05,
"loss": 1.8055,
"num_input_tokens_seen": 5024776192,
"step": 599
},
{
"epoch": 0.3,
"grad_norm": 0.6534265279769897,
"learning_rate": 2e-05,
"loss": 1.5457,
"num_input_tokens_seen": 5033164800,
"step": 600
},
{
"epoch": 0.3005,
"grad_norm": 0.687644898891449,
"learning_rate": 1.9921460385523884e-05,
"loss": 1.5855,
"num_input_tokens_seen": 5041553408,
"step": 601
},
{
"epoch": 0.301,
"grad_norm": 0.5359103679656982,
"learning_rate": 1.9842921982225782e-05,
"loss": 1.5351,
"num_input_tokens_seen": 5049942016,
"step": 602
},
{
"epoch": 0.3015,
"grad_norm": 0.4846513569355011,
"learning_rate": 1.9764386001265015e-05,
"loss": 1.518,
"num_input_tokens_seen": 5058330624,
"step": 603
},
{
"epoch": 0.302,
"grad_norm": 0.59798663854599,
"learning_rate": 1.9685853653763592e-05,
"loss": 1.5217,
"num_input_tokens_seen": 5066719232,
"step": 604
},
{
"epoch": 0.3025,
"grad_norm": 0.45739227533340454,
"learning_rate": 1.960732615078744e-05,
"loss": 1.4634,
"num_input_tokens_seen": 5075107840,
"step": 605
},
{
"epoch": 0.303,
"grad_norm": 0.5035345554351807,
"learning_rate": 1.95288047033278e-05,
"loss": 1.5809,
"num_input_tokens_seen": 5083496448,
"step": 606
},
{
"epoch": 0.3035,
"grad_norm": 0.4467843174934387,
"learning_rate": 1.9450290522282533e-05,
"loss": 1.5883,
"num_input_tokens_seen": 5091885056,
"step": 607
},
{
"epoch": 0.304,
"grad_norm": 0.6839386224746704,
"learning_rate": 1.9371784818437436e-05,
"loss": 1.5005,
"num_input_tokens_seen": 5100273664,
"step": 608
},
{
"epoch": 0.3045,
"grad_norm": 0.4985339343547821,
"learning_rate": 1.929328880244758e-05,
"loss": 1.5321,
"num_input_tokens_seen": 5108662272,
"step": 609
},
{
"epoch": 0.305,
"grad_norm": 0.581303596496582,
"learning_rate": 1.9214803684818636e-05,
"loss": 1.5554,
"num_input_tokens_seen": 5117050880,
"step": 610
},
{
"epoch": 0.3055,
"grad_norm": 0.4922437369823456,
"learning_rate": 1.9136330675888192e-05,
"loss": 1.5549,
"num_input_tokens_seen": 5125439488,
"step": 611
},
{
"epoch": 0.306,
"grad_norm": 0.5247146487236023,
"learning_rate": 1.905787098580715e-05,
"loss": 1.9025,
"num_input_tokens_seen": 5133828096,
"step": 612
},
{
"epoch": 0.3065,
"grad_norm": 0.5503407120704651,
"learning_rate": 1.897942582452097e-05,
"loss": 1.6479,
"num_input_tokens_seen": 5142216704,
"step": 613
},
{
"epoch": 0.307,
"grad_norm": 0.5089061260223389,
"learning_rate": 1.890099640175109e-05,
"loss": 1.6435,
"num_input_tokens_seen": 5150605312,
"step": 614
},
{
"epoch": 0.3075,
"grad_norm": 0.5309303998947144,
"learning_rate": 1.882258392697622e-05,
"loss": 1.5801,
"num_input_tokens_seen": 5158993920,
"step": 615
},
{
"epoch": 0.308,
"grad_norm": 0.45839405059814453,
"learning_rate": 1.8744189609413733e-05,
"loss": 1.6478,
"num_input_tokens_seen": 5167382528,
"step": 616
},
{
"epoch": 0.3085,
"grad_norm": 0.5259501934051514,
"learning_rate": 1.8665814658000982e-05,
"loss": 1.6075,
"num_input_tokens_seen": 5175771136,
"step": 617
},
{
"epoch": 0.309,
"grad_norm": 0.3942556381225586,
"learning_rate": 1.8587460281376673e-05,
"loss": 1.7839,
"num_input_tokens_seen": 5184159744,
"step": 618
},
{
"epoch": 0.3095,
"grad_norm": 0.5850613117218018,
"learning_rate": 1.8509127687862208e-05,
"loss": 1.5596,
"num_input_tokens_seen": 5192548352,
"step": 619
},
{
"epoch": 0.31,
"grad_norm": 0.523441731929779,
"learning_rate": 1.8430818085443106e-05,
"loss": 1.5867,
"num_input_tokens_seen": 5200936960,
"step": 620
},
{
"epoch": 0.3105,
"grad_norm": 0.5120038986206055,
"learning_rate": 1.835253268175029e-05,
"loss": 1.5351,
"num_input_tokens_seen": 5209325568,
"step": 621
},
{
"epoch": 0.311,
"grad_norm": 0.5440953969955444,
"learning_rate": 1.8274272684041537e-05,
"loss": 1.5445,
"num_input_tokens_seen": 5217714176,
"step": 622
},
{
"epoch": 0.3115,
"grad_norm": 0.6131273508071899,
"learning_rate": 1.8196039299182818e-05,
"loss": 1.541,
"num_input_tokens_seen": 5226102784,
"step": 623
},
{
"epoch": 0.312,
"grad_norm": 0.4762137234210968,
"learning_rate": 1.8117833733629715e-05,
"loss": 1.5922,
"num_input_tokens_seen": 5234491392,
"step": 624
},
{
"epoch": 0.3125,
"grad_norm": 0.5485303401947021,
"learning_rate": 1.8039657193408788e-05,
"loss": 1.5588,
"num_input_tokens_seen": 5242880000,
"step": 625
},
{
"epoch": 0.313,
"grad_norm": 0.4996958374977112,
"learning_rate": 1.7961510884099005e-05,
"loss": 1.591,
"num_input_tokens_seen": 5251268608,
"step": 626
},
{
"epoch": 0.3135,
"grad_norm": 0.5148504972457886,
"learning_rate": 1.7883396010813116e-05,
"loss": 1.5337,
"num_input_tokens_seen": 5259657216,
"step": 627
},
{
"epoch": 0.314,
"grad_norm": 0.4594258666038513,
"learning_rate": 1.7805313778179095e-05,
"loss": 1.7369,
"num_input_tokens_seen": 5268045824,
"step": 628
},
{
"epoch": 0.3145,
"grad_norm": 0.5273151397705078,
"learning_rate": 1.772726539032158e-05,
"loss": 1.4446,
"num_input_tokens_seen": 5276434432,
"step": 629
},
{
"epoch": 0.315,
"grad_norm": 0.5533626079559326,
"learning_rate": 1.764925205084325e-05,
"loss": 1.5265,
"num_input_tokens_seen": 5284823040,
"step": 630
},
{
"epoch": 0.3155,
"grad_norm": 0.4903721213340759,
"learning_rate": 1.7571274962806316e-05,
"loss": 1.7801,
"num_input_tokens_seen": 5293211648,
"step": 631
},
{
"epoch": 0.316,
"grad_norm": 0.49773064255714417,
"learning_rate": 1.7493335328713913e-05,
"loss": 1.701,
"num_input_tokens_seen": 5301600256,
"step": 632
},
{
"epoch": 0.3165,
"grad_norm": 0.4572118818759918,
"learning_rate": 1.741543435049165e-05,
"loss": 1.6753,
"num_input_tokens_seen": 5309988864,
"step": 633
},
{
"epoch": 0.317,
"grad_norm": 0.5031276941299438,
"learning_rate": 1.7337573229468958e-05,
"loss": 1.5555,
"num_input_tokens_seen": 5318377472,
"step": 634
},
{
"epoch": 0.3175,
"grad_norm": 0.5255052447319031,
"learning_rate": 1.7259753166360644e-05,
"loss": 1.4977,
"num_input_tokens_seen": 5326766080,
"step": 635
},
{
"epoch": 0.318,
"grad_norm": 0.5148425698280334,
"learning_rate": 1.7181975361248348e-05,
"loss": 1.5357,
"num_input_tokens_seen": 5335154688,
"step": 636
},
{
"epoch": 0.3185,
"grad_norm": 0.5783670544624329,
"learning_rate": 1.7104241013562045e-05,
"loss": 1.6418,
"num_input_tokens_seen": 5343543296,
"step": 637
},
{
"epoch": 0.319,
"grad_norm": 0.40350809693336487,
"learning_rate": 1.702655132206154e-05,
"loss": 1.6714,
"num_input_tokens_seen": 5351931904,
"step": 638
},
{
"epoch": 0.3195,
"grad_norm": 0.49052226543426514,
"learning_rate": 1.6948907484817985e-05,
"loss": 1.5475,
"num_input_tokens_seen": 5360320512,
"step": 639
},
{
"epoch": 0.32,
"grad_norm": 0.45277076959609985,
"learning_rate": 1.687131069919538e-05,
"loss": 1.5783,
"num_input_tokens_seen": 5368709120,
"step": 640
},
{
"epoch": 0.3205,
"grad_norm": 0.5141287446022034,
"learning_rate": 1.679376216183218e-05,
"loss": 1.6031,
"num_input_tokens_seen": 5377097728,
"step": 641
},
{
"epoch": 0.321,
"grad_norm": 0.37356165051460266,
"learning_rate": 1.6716263068622744e-05,
"loss": 1.5794,
"num_input_tokens_seen": 5385486336,
"step": 642
},
{
"epoch": 0.3215,
"grad_norm": 0.4839518666267395,
"learning_rate": 1.6638814614698965e-05,
"loss": 1.6072,
"num_input_tokens_seen": 5393874944,
"step": 643
},
{
"epoch": 0.322,
"grad_norm": 0.4161027669906616,
"learning_rate": 1.6561417994411808e-05,
"loss": 1.8477,
"num_input_tokens_seen": 5402263552,
"step": 644
},
{
"epoch": 0.3225,
"grad_norm": 0.5627133846282959,
"learning_rate": 1.648407440131291e-05,
"loss": 1.629,
"num_input_tokens_seen": 5410652160,
"step": 645
},
{
"epoch": 0.323,
"grad_norm": 0.45590704679489136,
"learning_rate": 1.640678502813615e-05,
"loss": 1.6007,
"num_input_tokens_seen": 5419040768,
"step": 646
},
{
"epoch": 0.3235,
"grad_norm": 0.6110590696334839,
"learning_rate": 1.6329551066779278e-05,
"loss": 1.6822,
"num_input_tokens_seen": 5427429376,
"step": 647
},
{
"epoch": 0.324,
"grad_norm": 0.442785382270813,
"learning_rate": 1.6252373708285505e-05,
"loss": 1.8112,
"num_input_tokens_seen": 5435817984,
"step": 648
},
{
"epoch": 0.3245,
"grad_norm": 0.4794580340385437,
"learning_rate": 1.6175254142825196e-05,
"loss": 1.59,
"num_input_tokens_seen": 5444206592,
"step": 649
},
{
"epoch": 0.325,
"grad_norm": 0.4918476641178131,
"learning_rate": 1.609819355967744e-05,
"loss": 1.6212,
"num_input_tokens_seen": 5452595200,
"step": 650
},
{
"epoch": 0.3255,
"grad_norm": 0.5037944316864014,
"learning_rate": 1.602119314721175e-05,
"loss": 1.4959,
"num_input_tokens_seen": 5460983808,
"step": 651
},
{
"epoch": 0.326,
"grad_norm": 0.638948380947113,
"learning_rate": 1.5944254092869756e-05,
"loss": 1.6125,
"num_input_tokens_seen": 5469372416,
"step": 652
},
{
"epoch": 0.3265,
"grad_norm": 0.46257832646369934,
"learning_rate": 1.5867377583146836e-05,
"loss": 1.7298,
"num_input_tokens_seen": 5477761024,
"step": 653
},
{
"epoch": 0.327,
"grad_norm": 0.5825093984603882,
"learning_rate": 1.579056480357389e-05,
"loss": 1.6123,
"num_input_tokens_seen": 5486149632,
"step": 654
},
{
"epoch": 0.3275,
"grad_norm": 0.4797838628292084,
"learning_rate": 1.571381693869899e-05,
"loss": 1.6064,
"num_input_tokens_seen": 5494538240,
"step": 655
},
{
"epoch": 0.328,
"grad_norm": 0.6016621589660645,
"learning_rate": 1.5637135172069155e-05,
"loss": 1.535,
"num_input_tokens_seen": 5502926848,
"step": 656
},
{
"epoch": 0.3285,
"grad_norm": 0.48697134852409363,
"learning_rate": 1.5560520686212083e-05,
"loss": 1.7159,
"num_input_tokens_seen": 5511315456,
"step": 657
},
{
"epoch": 0.329,
"grad_norm": 0.5528718829154968,
"learning_rate": 1.548397466261793e-05,
"loss": 1.5113,
"num_input_tokens_seen": 5519704064,
"step": 658
},
{
"epoch": 0.3295,
"grad_norm": 0.48319679498672485,
"learning_rate": 1.5407498281721063e-05,
"loss": 1.6741,
"num_input_tokens_seen": 5528092672,
"step": 659
},
{
"epoch": 0.33,
"grad_norm": 0.4717366099357605,
"learning_rate": 1.53310927228819e-05,
"loss": 1.5487,
"num_input_tokens_seen": 5536481280,
"step": 660
},
{
"epoch": 0.3305,
"grad_norm": 0.4618384838104248,
"learning_rate": 1.5254759164368644e-05,
"loss": 1.7316,
"num_input_tokens_seen": 5544869888,
"step": 661
},
{
"epoch": 0.331,
"grad_norm": 0.7245141267776489,
"learning_rate": 1.517849878333923e-05,
"loss": 1.7489,
"num_input_tokens_seen": 5553258496,
"step": 662
},
{
"epoch": 0.3315,
"grad_norm": 0.41874098777770996,
"learning_rate": 1.5102312755823053e-05,
"loss": 1.6518,
"num_input_tokens_seen": 5561647104,
"step": 663
},
{
"epoch": 0.332,
"grad_norm": 0.46870699524879456,
"learning_rate": 1.5026202256702909e-05,
"loss": 1.598,
"num_input_tokens_seen": 5570035712,
"step": 664
},
{
"epoch": 0.3325,
"grad_norm": 0.37529900670051575,
"learning_rate": 1.4950168459696841e-05,
"loss": 1.6456,
"num_input_tokens_seen": 5578424320,
"step": 665
},
{
"epoch": 0.333,
"grad_norm": 0.4923308789730072,
"learning_rate": 1.4874212537340067e-05,
"loss": 1.4711,
"num_input_tokens_seen": 5586812928,
"step": 666
},
{
"epoch": 0.3335,
"grad_norm": 0.3929249048233032,
"learning_rate": 1.4798335660966869e-05,
"loss": 1.5761,
"num_input_tokens_seen": 5595201536,
"step": 667
},
{
"epoch": 0.334,
"grad_norm": 0.4999372661113739,
"learning_rate": 1.4722539000692548e-05,
"loss": 1.5069,
"num_input_tokens_seen": 5603590144,
"step": 668
},
{
"epoch": 0.3345,
"grad_norm": 0.44993501901626587,
"learning_rate": 1.4646823725395351e-05,
"loss": 1.6068,
"num_input_tokens_seen": 5611978752,
"step": 669
},
{
"epoch": 0.335,
"grad_norm": 0.5363733172416687,
"learning_rate": 1.4571191002698517e-05,
"loss": 1.3344,
"num_input_tokens_seen": 5620367360,
"step": 670
},
{
"epoch": 0.3355,
"grad_norm": 0.5620055198669434,
"learning_rate": 1.4495641998952172e-05,
"loss": 1.5378,
"num_input_tokens_seen": 5628755968,
"step": 671
},
{
"epoch": 0.336,
"grad_norm": 0.4853207468986511,
"learning_rate": 1.4420177879215419e-05,
"loss": 1.5294,
"num_input_tokens_seen": 5637144576,
"step": 672
},
{
"epoch": 0.3365,
"grad_norm": 0.4831787645816803,
"learning_rate": 1.434479980723833e-05,
"loss": 1.5064,
"num_input_tokens_seen": 5645533184,
"step": 673
},
{
"epoch": 0.337,
"grad_norm": 0.6178323030471802,
"learning_rate": 1.4269508945444033e-05,
"loss": 1.5201,
"num_input_tokens_seen": 5653921792,
"step": 674
},
{
"epoch": 0.3375,
"grad_norm": 0.4733405113220215,
"learning_rate": 1.4194306454910757e-05,
"loss": 1.6281,
"num_input_tokens_seen": 5662310400,
"step": 675
},
{
"epoch": 0.338,
"grad_norm": 0.5506306886672974,
"learning_rate": 1.4119193495353925e-05,
"loss": 1.3539,
"num_input_tokens_seen": 5670699008,
"step": 676
},
{
"epoch": 0.3385,
"grad_norm": 0.5859827995300293,
"learning_rate": 1.40441712251083e-05,
"loss": 1.6797,
"num_input_tokens_seen": 5679087616,
"step": 677
},
{
"epoch": 0.339,
"grad_norm": 0.5389025211334229,
"learning_rate": 1.3969240801110088e-05,
"loss": 1.5045,
"num_input_tokens_seen": 5687476224,
"step": 678
},
{
"epoch": 0.3395,
"grad_norm": 0.5681171417236328,
"learning_rate": 1.3894403378879132e-05,
"loss": 1.56,
"num_input_tokens_seen": 5695864832,
"step": 679
},
{
"epoch": 0.34,
"grad_norm": 0.5575128197669983,
"learning_rate": 1.3819660112501054e-05,
"loss": 1.4456,
"num_input_tokens_seen": 5704253440,
"step": 680
},
{
"epoch": 0.3405,
"grad_norm": 0.4929724335670471,
"learning_rate": 1.3745012154609492e-05,
"loss": 1.4229,
"num_input_tokens_seen": 5712642048,
"step": 681
},
{
"epoch": 0.341,
"grad_norm": 0.44603395462036133,
"learning_rate": 1.3670460656368278e-05,
"loss": 1.6888,
"num_input_tokens_seen": 5721030656,
"step": 682
},
{
"epoch": 0.3415,
"grad_norm": 0.4557688236236572,
"learning_rate": 1.3596006767453766e-05,
"loss": 1.6441,
"num_input_tokens_seen": 5729419264,
"step": 683
},
{
"epoch": 0.342,
"grad_norm": 0.42381447553634644,
"learning_rate": 1.3521651636037017e-05,
"loss": 1.7471,
"num_input_tokens_seen": 5737807872,
"step": 684
},
{
"epoch": 0.3425,
"grad_norm": 0.4877522587776184,
"learning_rate": 1.3447396408766134e-05,
"loss": 1.6108,
"num_input_tokens_seen": 5746196480,
"step": 685
},
{
"epoch": 0.343,
"grad_norm": 0.5389087796211243,
"learning_rate": 1.3373242230748579e-05,
"loss": 1.4052,
"num_input_tokens_seen": 5754585088,
"step": 686
},
{
"epoch": 0.3435,
"grad_norm": 0.5732712745666504,
"learning_rate": 1.3299190245533522e-05,
"loss": 1.6114,
"num_input_tokens_seen": 5762973696,
"step": 687
},
{
"epoch": 0.344,
"grad_norm": 0.37976858019828796,
"learning_rate": 1.3225241595094173e-05,
"loss": 1.6381,
"num_input_tokens_seen": 5771362304,
"step": 688
},
{
"epoch": 0.3445,
"grad_norm": 0.6109358668327332,
"learning_rate": 1.3151397419810207e-05,
"loss": 1.4704,
"num_input_tokens_seen": 5779750912,
"step": 689
},
{
"epoch": 0.345,
"grad_norm": 0.3784377872943878,
"learning_rate": 1.3077658858450137e-05,
"loss": 1.7119,
"num_input_tokens_seen": 5788139520,
"step": 690
},
{
"epoch": 0.3455,
"grad_norm": 0.534516453742981,
"learning_rate": 1.3004027048153826e-05,
"loss": 1.4831,
"num_input_tokens_seen": 5796528128,
"step": 691
},
{
"epoch": 0.346,
"grad_norm": 0.44002264738082886,
"learning_rate": 1.2930503124414862e-05,
"loss": 1.6671,
"num_input_tokens_seen": 5804916736,
"step": 692
},
{
"epoch": 0.3465,
"grad_norm": 0.47504231333732605,
"learning_rate": 1.2857088221063099e-05,
"loss": 1.7426,
"num_input_tokens_seen": 5813305344,
"step": 693
},
{
"epoch": 0.347,
"grad_norm": 0.4503721296787262,
"learning_rate": 1.2783783470247164e-05,
"loss": 1.7178,
"num_input_tokens_seen": 5821693952,
"step": 694
},
{
"epoch": 0.3475,
"grad_norm": 0.38480740785598755,
"learning_rate": 1.2710590002417008e-05,
"loss": 1.5611,
"num_input_tokens_seen": 5830082560,
"step": 695
},
{
"epoch": 0.348,
"grad_norm": 0.4555624723434448,
"learning_rate": 1.2637508946306443e-05,
"loss": 1.5239,
"num_input_tokens_seen": 5838471168,
"step": 696
},
{
"epoch": 0.3485,
"grad_norm": 0.38544774055480957,
"learning_rate": 1.2564541428915762e-05,
"loss": 1.5648,
"num_input_tokens_seen": 5846859776,
"step": 697
},
{
"epoch": 0.349,
"grad_norm": 0.4004034399986267,
"learning_rate": 1.2491688575494337e-05,
"loss": 1.8762,
"num_input_tokens_seen": 5855248384,
"step": 698
},
{
"epoch": 0.3495,
"grad_norm": 0.4557759761810303,
"learning_rate": 1.2418951509523312e-05,
"loss": 1.7131,
"num_input_tokens_seen": 5863636992,
"step": 699
},
{
"epoch": 0.35,
"grad_norm": 0.4302028715610504,
"learning_rate": 1.2346331352698206e-05,
"loss": 1.4877,
"num_input_tokens_seen": 5872025600,
"step": 700
},
{
"epoch": 0.3505,
"grad_norm": 0.4745676517486572,
"learning_rate": 1.2273829224911685e-05,
"loss": 1.5291,
"num_input_tokens_seen": 5880414208,
"step": 701
},
{
"epoch": 0.351,
"grad_norm": 0.4593994617462158,
"learning_rate": 1.2201446244236242e-05,
"loss": 1.496,
"num_input_tokens_seen": 5888802816,
"step": 702
},
{
"epoch": 0.3515,
"grad_norm": 0.4248128831386566,
"learning_rate": 1.2129183526906971e-05,
"loss": 1.5912,
"num_input_tokens_seen": 5897191424,
"step": 703
},
{
"epoch": 0.352,
"grad_norm": 0.4090263545513153,
"learning_rate": 1.205704218730439e-05,
"loss": 1.6625,
"num_input_tokens_seen": 5905580032,
"step": 704
},
{
"epoch": 0.3525,
"grad_norm": 0.4674663841724396,
"learning_rate": 1.1985023337937185e-05,
"loss": 1.6483,
"num_input_tokens_seen": 5913968640,
"step": 705
},
{
"epoch": 0.353,
"grad_norm": 0.41158390045166016,
"learning_rate": 1.1913128089425103e-05,
"loss": 1.4624,
"num_input_tokens_seen": 5922357248,
"step": 706
},
{
"epoch": 0.3535,
"grad_norm": 0.37938833236694336,
"learning_rate": 1.1841357550481817e-05,
"loss": 1.5231,
"num_input_tokens_seen": 5930745856,
"step": 707
},
{
"epoch": 0.354,
"grad_norm": 0.5211839079856873,
"learning_rate": 1.1769712827897825e-05,
"loss": 1.6377,
"num_input_tokens_seen": 5939134464,
"step": 708
},
{
"epoch": 0.3545,
"grad_norm": 0.4222390055656433,
"learning_rate": 1.1698195026523379e-05,
"loss": 1.5385,
"num_input_tokens_seen": 5947523072,
"step": 709
},
{
"epoch": 0.355,
"grad_norm": 0.5263103246688843,
"learning_rate": 1.1626805249251444e-05,
"loss": 1.432,
"num_input_tokens_seen": 5955911680,
"step": 710
},
{
"epoch": 0.3555,
"grad_norm": 0.40875500440597534,
"learning_rate": 1.1555544597000693e-05,
"loss": 1.5782,
"num_input_tokens_seen": 5964300288,
"step": 711
},
{
"epoch": 0.356,
"grad_norm": 0.4516502618789673,
"learning_rate": 1.1484414168698547e-05,
"loss": 1.7217,
"num_input_tokens_seen": 5972688896,
"step": 712
},
{
"epoch": 0.3565,
"grad_norm": 0.4005357623100281,
"learning_rate": 1.1413415061264205e-05,
"loss": 1.5356,
"num_input_tokens_seen": 5981077504,
"step": 713
},
{
"epoch": 0.357,
"grad_norm": 0.4757128059864044,
"learning_rate": 1.134254836959173e-05,
"loss": 1.5754,
"num_input_tokens_seen": 5989466112,
"step": 714
},
{
"epoch": 0.3575,
"grad_norm": 0.3795611560344696,
"learning_rate": 1.1271815186533156e-05,
"loss": 1.5715,
"num_input_tokens_seen": 5997854720,
"step": 715
},
{
"epoch": 0.358,
"grad_norm": 0.48202642798423767,
"learning_rate": 1.1201216602881696e-05,
"loss": 1.474,
"num_input_tokens_seen": 6006243328,
"step": 716
},
{
"epoch": 0.3585,
"grad_norm": 0.4397919178009033,
"learning_rate": 1.1130753707354836e-05,
"loss": 1.4755,
"num_input_tokens_seen": 6014631936,
"step": 717
},
{
"epoch": 0.359,
"grad_norm": 0.4739425778388977,
"learning_rate": 1.106042758657758e-05,
"loss": 1.5371,
"num_input_tokens_seen": 6023020544,
"step": 718
},
{
"epoch": 0.3595,
"grad_norm": 0.477103590965271,
"learning_rate": 1.0990239325065714e-05,
"loss": 1.6359,
"num_input_tokens_seen": 6031409152,
"step": 719
},
{
"epoch": 0.36,
"grad_norm": 0.40918999910354614,
"learning_rate": 1.0920190005209066e-05,
"loss": 1.5021,
"num_input_tokens_seen": 6039797760,
"step": 720
},
{
"epoch": 0.3605,
"grad_norm": 0.5159934163093567,
"learning_rate": 1.085028070725479e-05,
"loss": 1.6418,
"num_input_tokens_seen": 6048186368,
"step": 721
},
{
"epoch": 0.361,
"grad_norm": 0.7591975331306458,
"learning_rate": 1.0780512509290758e-05,
"loss": 1.7691,
"num_input_tokens_seen": 6056574976,
"step": 722
},
{
"epoch": 0.3615,
"grad_norm": 0.5116597414016724,
"learning_rate": 1.0710886487228868e-05,
"loss": 1.6482,
"num_input_tokens_seen": 6064963584,
"step": 723
},
{
"epoch": 0.362,
"grad_norm": 0.41808661818504333,
"learning_rate": 1.0641403714788537e-05,
"loss": 1.4123,
"num_input_tokens_seen": 6073352192,
"step": 724
},
{
"epoch": 0.3625,
"grad_norm": 0.5217900276184082,
"learning_rate": 1.0572065263480046e-05,
"loss": 1.5236,
"num_input_tokens_seen": 6081740800,
"step": 725
},
{
"epoch": 0.363,
"grad_norm": 0.4180975556373596,
"learning_rate": 1.0502872202588113e-05,
"loss": 1.6335,
"num_input_tokens_seen": 6090129408,
"step": 726
},
{
"epoch": 0.3635,
"grad_norm": 0.513573944568634,
"learning_rate": 1.043382559915532e-05,
"loss": 1.5707,
"num_input_tokens_seen": 6098518016,
"step": 727
},
{
"epoch": 0.364,
"grad_norm": 0.4156613051891327,
"learning_rate": 1.0364926517965693e-05,
"loss": 1.5941,
"num_input_tokens_seen": 6106906624,
"step": 728
},
{
"epoch": 0.3645,
"grad_norm": 0.43025484681129456,
"learning_rate": 1.0296176021528326e-05,
"loss": 1.6518,
"num_input_tokens_seen": 6115295232,
"step": 729
},
{
"epoch": 0.365,
"grad_norm": 0.4618057906627655,
"learning_rate": 1.0227575170060909e-05,
"loss": 1.4235,
"num_input_tokens_seen": 6123683840,
"step": 730
},
{
"epoch": 0.3655,
"grad_norm": 0.3291275203227997,
"learning_rate": 1.0159125021473421e-05,
"loss": 1.6211,
"num_input_tokens_seen": 6132072448,
"step": 731
},
{
"epoch": 0.366,
"grad_norm": 0.41411274671554565,
"learning_rate": 1.009082663135185e-05,
"loss": 1.5563,
"num_input_tokens_seen": 6140461056,
"step": 732
},
{
"epoch": 0.3665,
"grad_norm": 0.3771957457065582,
"learning_rate": 1.0022681052941856e-05,
"loss": 1.6889,
"num_input_tokens_seen": 6148849664,
"step": 733
},
{
"epoch": 0.367,
"grad_norm": 0.4875394105911255,
"learning_rate": 9.95468933713255e-06,
"loss": 1.3924,
"num_input_tokens_seen": 6157238272,
"step": 734
},
{
"epoch": 0.3675,
"grad_norm": 0.421825647354126,
"learning_rate": 9.886852532440312e-06,
"loss": 1.8188,
"num_input_tokens_seen": 6165626880,
"step": 735
},
{
"epoch": 0.368,
"grad_norm": 0.4703611433506012,
"learning_rate": 9.819171684992575e-06,
"loss": 1.6558,
"num_input_tokens_seen": 6174015488,
"step": 736
},
{
"epoch": 0.3685,
"grad_norm": 0.49299588799476624,
"learning_rate": 9.751647838511747e-06,
"loss": 1.6531,
"num_input_tokens_seen": 6182404096,
"step": 737
},
{
"epoch": 0.369,
"grad_norm": 0.4184141457080841,
"learning_rate": 9.684282034299053e-06,
"loss": 1.4939,
"num_input_tokens_seen": 6190792704,
"step": 738
},
{
"epoch": 0.3695,
"grad_norm": 0.4581114947795868,
"learning_rate": 9.61707531121855e-06,
"loss": 1.4902,
"num_input_tokens_seen": 6199181312,
"step": 739
},
{
"epoch": 0.37,
"grad_norm": 0.37457022070884705,
"learning_rate": 9.550028705681024e-06,
"loss": 1.5622,
"num_input_tokens_seen": 6207569920,
"step": 740
},
{
"epoch": 0.3705,
"grad_norm": 0.4025091826915741,
"learning_rate": 9.483143251628088e-06,
"loss": 1.6402,
"num_input_tokens_seen": 6215958528,
"step": 741
},
{
"epoch": 0.371,
"grad_norm": 0.4053475856781006,
"learning_rate": 9.416419980516192e-06,
"loss": 1.6449,
"num_input_tokens_seen": 6224347136,
"step": 742
},
{
"epoch": 0.3715,
"grad_norm": 0.3622041940689087,
"learning_rate": 9.349859921300704e-06,
"loss": 1.426,
"num_input_tokens_seen": 6232735744,
"step": 743
},
{
"epoch": 0.372,
"grad_norm": 0.4312250316143036,
"learning_rate": 9.283464100420064e-06,
"loss": 1.6787,
"num_input_tokens_seen": 6241124352,
"step": 744
},
{
"epoch": 0.3725,
"grad_norm": 0.39573124051094055,
"learning_rate": 9.217233541779995e-06,
"loss": 1.4883,
"num_input_tokens_seen": 6249512960,
"step": 745
},
{
"epoch": 0.373,
"grad_norm": 0.4040946960449219,
"learning_rate": 9.15116926673763e-06,
"loss": 1.6978,
"num_input_tokens_seen": 6257901568,
"step": 746
},
{
"epoch": 0.3735,
"grad_norm": 0.412345826625824,
"learning_rate": 9.085272294085803e-06,
"loss": 1.6549,
"num_input_tokens_seen": 6266290176,
"step": 747
},
{
"epoch": 0.374,
"grad_norm": 0.4017808139324188,
"learning_rate": 9.019543640037363e-06,
"loss": 1.4813,
"num_input_tokens_seen": 6274678784,
"step": 748
},
{
"epoch": 0.3745,
"grad_norm": 0.40174025297164917,
"learning_rate": 8.95398431820947e-06,
"loss": 1.5915,
"num_input_tokens_seen": 6283067392,
"step": 749
},
{
"epoch": 0.375,
"grad_norm": 0.37170112133026123,
"learning_rate": 8.888595339607961e-06,
"loss": 1.6085,
"num_input_tokens_seen": 6291456000,
"step": 750
},
{
"epoch": 0.3755,
"grad_norm": 0.3888947069644928,
"learning_rate": 8.82337771261177e-06,
"loss": 1.4901,
"num_input_tokens_seen": 6299844608,
"step": 751
},
{
"epoch": 0.376,
"grad_norm": 0.39092621207237244,
"learning_rate": 8.758332442957394e-06,
"loss": 1.4806,
"num_input_tokens_seen": 6308233216,
"step": 752
},
{
"epoch": 0.3765,
"grad_norm": 0.4453209638595581,
"learning_rate": 8.693460533723346e-06,
"loss": 1.6993,
"num_input_tokens_seen": 6316621824,
"step": 753
},
{
"epoch": 0.377,
"grad_norm": 0.3836767077445984,
"learning_rate": 8.62876298531472e-06,
"loss": 1.6511,
"num_input_tokens_seen": 6325010432,
"step": 754
},
{
"epoch": 0.3775,
"grad_norm": 0.3998052775859833,
"learning_rate": 8.564240795447758e-06,
"loss": 1.5698,
"num_input_tokens_seen": 6333399040,
"step": 755
},
{
"epoch": 0.378,
"grad_norm": 0.3738134503364563,
"learning_rate": 8.499894959134436e-06,
"loss": 1.6333,
"num_input_tokens_seen": 6341787648,
"step": 756
},
{
"epoch": 0.3785,
"grad_norm": 0.36466184258461,
"learning_rate": 8.435726468667135e-06,
"loss": 1.5178,
"num_input_tokens_seen": 6350176256,
"step": 757
},
{
"epoch": 0.379,
"grad_norm": 0.4085821807384491,
"learning_rate": 8.37173631360339e-06,
"loss": 1.6823,
"num_input_tokens_seen": 6358564864,
"step": 758
},
{
"epoch": 0.3795,
"grad_norm": 0.3954522907733917,
"learning_rate": 8.307925480750535e-06,
"loss": 1.4361,
"num_input_tokens_seen": 6366953472,
"step": 759
},
{
"epoch": 0.38,
"grad_norm": 0.4430011808872223,
"learning_rate": 8.24429495415054e-06,
"loss": 1.6206,
"num_input_tokens_seen": 6375342080,
"step": 760
},
{
"epoch": 0.3805,
"grad_norm": 0.4140196740627289,
"learning_rate": 8.180845715064851e-06,
"loss": 1.4822,
"num_input_tokens_seen": 6383730688,
"step": 761
},
{
"epoch": 0.381,
"grad_norm": 0.3814021944999695,
"learning_rate": 8.117578741959232e-06,
"loss": 1.584,
"num_input_tokens_seen": 6392119296,
"step": 762
},
{
"epoch": 0.3815,
"grad_norm": 0.3553796410560608,
"learning_rate": 8.054495010488658e-06,
"loss": 1.556,
"num_input_tokens_seen": 6400507904,
"step": 763
},
{
"epoch": 0.382,
"grad_norm": 0.38275453448295593,
"learning_rate": 7.991595493482323e-06,
"loss": 1.4992,
"num_input_tokens_seen": 6408896512,
"step": 764
},
{
"epoch": 0.3825,
"grad_norm": 0.36514538526535034,
"learning_rate": 7.928881160928572e-06,
"loss": 1.5722,
"num_input_tokens_seen": 6417285120,
"step": 765
},
{
"epoch": 0.383,
"grad_norm": 0.4606564939022064,
"learning_rate": 7.86635297996001e-06,
"loss": 1.5389,
"num_input_tokens_seen": 6425673728,
"step": 766
},
{
"epoch": 0.3835,
"grad_norm": 0.40744417905807495,
"learning_rate": 7.804011914838524e-06,
"loss": 1.4212,
"num_input_tokens_seen": 6434062336,
"step": 767
},
{
"epoch": 0.384,
"grad_norm": 0.42578741908073425,
"learning_rate": 7.741858926940475e-06,
"loss": 1.5838,
"num_input_tokens_seen": 6442450944,
"step": 768
},
{
"epoch": 0.3845,
"grad_norm": 0.45914557576179504,
"learning_rate": 7.679894974741807e-06,
"loss": 1.633,
"num_input_tokens_seen": 6450839552,
"step": 769
},
{
"epoch": 0.385,
"grad_norm": 0.40932121872901917,
"learning_rate": 7.618121013803319e-06,
"loss": 1.422,
"num_input_tokens_seen": 6459228160,
"step": 770
},
{
"epoch": 0.3855,
"grad_norm": 0.35300320386886597,
"learning_rate": 7.556537996755919e-06,
"loss": 1.5215,
"num_input_tokens_seen": 6467616768,
"step": 771
},
{
"epoch": 0.386,
"grad_norm": 0.3876708149909973,
"learning_rate": 7.495146873285904e-06,
"loss": 1.6108,
"num_input_tokens_seen": 6476005376,
"step": 772
},
{
"epoch": 0.3865,
"grad_norm": 0.3526962697505951,
"learning_rate": 7.433948590120326e-06,
"loss": 1.5735,
"num_input_tokens_seen": 6484393984,
"step": 773
},
{
"epoch": 0.387,
"grad_norm": 0.377326101064682,
"learning_rate": 7.3729440910124464e-06,
"loss": 1.5143,
"num_input_tokens_seen": 6492782592,
"step": 774
},
{
"epoch": 0.3875,
"grad_norm": 0.3809109628200531,
"learning_rate": 7.312134316727093e-06,
"loss": 1.5331,
"num_input_tokens_seen": 6501171200,
"step": 775
},
{
"epoch": 0.388,
"grad_norm": 0.41740885376930237,
"learning_rate": 7.251520205026206e-06,
"loss": 1.7121,
"num_input_tokens_seen": 6509559808,
"step": 776
},
{
"epoch": 0.3885,
"grad_norm": 0.375685453414917,
"learning_rate": 7.191102690654384e-06,
"loss": 1.4729,
"num_input_tokens_seen": 6517948416,
"step": 777
},
{
"epoch": 0.389,
"grad_norm": 0.3461897671222687,
"learning_rate": 7.130882705324422e-06,
"loss": 1.4276,
"num_input_tokens_seen": 6526337024,
"step": 778
},
{
"epoch": 0.3895,
"grad_norm": 0.3789466619491577,
"learning_rate": 7.070861177703006e-06,
"loss": 1.5973,
"num_input_tokens_seen": 6534725632,
"step": 779
},
{
"epoch": 0.39,
"grad_norm": 0.40518102049827576,
"learning_rate": 7.01103903339633e-06,
"loss": 1.4118,
"num_input_tokens_seen": 6543114240,
"step": 780
},
{
"epoch": 0.3905,
"grad_norm": 0.3697455823421478,
"learning_rate": 6.95141719493587e-06,
"loss": 1.6321,
"num_input_tokens_seen": 6551502848,
"step": 781
},
{
"epoch": 0.391,
"grad_norm": 0.39922866225242615,
"learning_rate": 6.891996581764124e-06,
"loss": 1.5606,
"num_input_tokens_seen": 6559891456,
"step": 782
},
{
"epoch": 0.3915,
"grad_norm": 0.3575364947319031,
"learning_rate": 6.832778110220457e-06,
"loss": 1.5569,
"num_input_tokens_seen": 6568280064,
"step": 783
},
{
"epoch": 0.392,
"grad_norm": 0.36905914545059204,
"learning_rate": 6.773762693526967e-06,
"loss": 1.5744,
"num_input_tokens_seen": 6576668672,
"step": 784
},
{
"epoch": 0.3925,
"grad_norm": 0.5625297427177429,
"learning_rate": 6.7149512417743725e-06,
"loss": 1.3769,
"num_input_tokens_seen": 6585057280,
"step": 785
},
{
"epoch": 0.393,
"grad_norm": 0.3726418614387512,
"learning_rate": 6.656344661908003e-06,
"loss": 1.6744,
"num_input_tokens_seen": 6593445888,
"step": 786
},
{
"epoch": 0.3935,
"grad_norm": 0.39471179246902466,
"learning_rate": 6.597943857713849e-06,
"loss": 1.5823,
"num_input_tokens_seen": 6601834496,
"step": 787
},
{
"epoch": 0.394,
"grad_norm": 0.4886147975921631,
"learning_rate": 6.539749729804539e-06,
"loss": 1.4887,
"num_input_tokens_seen": 6610223104,
"step": 788
},
{
"epoch": 0.3945,
"grad_norm": 0.3547024130821228,
"learning_rate": 6.4817631756055086e-06,
"loss": 1.569,
"num_input_tokens_seen": 6618611712,
"step": 789
},
{
"epoch": 0.395,
"grad_norm": 0.4050310552120209,
"learning_rate": 6.423985089341165e-06,
"loss": 1.5851,
"num_input_tokens_seen": 6627000320,
"step": 790
},
{
"epoch": 0.3955,
"grad_norm": 0.36963605880737305,
"learning_rate": 6.366416362021077e-06,
"loss": 1.4311,
"num_input_tokens_seen": 6635388928,
"step": 791
},
{
"epoch": 0.396,
"grad_norm": 0.3320269286632538,
"learning_rate": 6.3090578814262256e-06,
"loss": 1.6496,
"num_input_tokens_seen": 6643777536,
"step": 792
},
{
"epoch": 0.3965,
"grad_norm": 0.4373522996902466,
"learning_rate": 6.251910532095349e-06,
"loss": 1.464,
"num_input_tokens_seen": 6652166144,
"step": 793
},
{
"epoch": 0.397,
"grad_norm": 0.3273358941078186,
"learning_rate": 6.1949751953112565e-06,
"loss": 1.5889,
"num_input_tokens_seen": 6660554752,
"step": 794
},
{
"epoch": 0.3975,
"grad_norm": 0.361979216337204,
"learning_rate": 6.138252749087286e-06,
"loss": 1.5708,
"num_input_tokens_seen": 6668943360,
"step": 795
},
{
"epoch": 0.398,
"grad_norm": 0.3504394292831421,
"learning_rate": 6.081744068153714e-06,
"loss": 1.7071,
"num_input_tokens_seen": 6677331968,
"step": 796
},
{
"epoch": 0.3985,
"grad_norm": 0.35012826323509216,
"learning_rate": 6.02545002394432e-06,
"loss": 1.5771,
"num_input_tokens_seen": 6685720576,
"step": 797
},
{
"epoch": 0.399,
"grad_norm": 0.34323230385780334,
"learning_rate": 5.969371484582887e-06,
"loss": 1.5181,
"num_input_tokens_seen": 6694109184,
"step": 798
},
{
"epoch": 0.3995,
"grad_norm": 0.3410869836807251,
"learning_rate": 5.913509314869874e-06,
"loss": 1.5847,
"num_input_tokens_seen": 6702497792,
"step": 799
},
{
"epoch": 0.4,
"grad_norm": 0.3655712604522705,
"learning_rate": 5.857864376269051e-06,
"loss": 1.5978,
"num_input_tokens_seen": 6710886400,
"step": 800
},
{
"epoch": 0.4005,
"grad_norm": 0.3299401104450226,
"learning_rate": 5.802437526894198e-06,
"loss": 1.5185,
"num_input_tokens_seen": 6719275008,
"step": 801
},
{
"epoch": 0.401,
"grad_norm": 0.33015549182891846,
"learning_rate": 5.747229621495893e-06,
"loss": 1.4544,
"num_input_tokens_seen": 6727663616,
"step": 802
},
{
"epoch": 0.4015,
"grad_norm": 0.28621819615364075,
"learning_rate": 5.692241511448342e-06,
"loss": 1.6958,
"num_input_tokens_seen": 6736052224,
"step": 803
},
{
"epoch": 0.402,
"grad_norm": 0.3436781167984009,
"learning_rate": 5.637474044736227e-06,
"loss": 1.4529,
"num_input_tokens_seen": 6744440832,
"step": 804
},
{
"epoch": 0.4025,
"grad_norm": 0.3146877884864807,
"learning_rate": 5.582928065941624e-06,
"loss": 1.6761,
"num_input_tokens_seen": 6752829440,
"step": 805
},
{
"epoch": 0.403,
"grad_norm": 0.3468015789985657,
"learning_rate": 5.528604416231016e-06,
"loss": 1.5827,
"num_input_tokens_seen": 6761218048,
"step": 806
},
{
"epoch": 0.4035,
"grad_norm": 0.30543383955955505,
"learning_rate": 5.474503933342272e-06,
"loss": 1.6216,
"num_input_tokens_seen": 6769606656,
"step": 807
},
{
"epoch": 0.404,
"grad_norm": 0.33638039231300354,
"learning_rate": 5.4206274515717735e-06,
"loss": 1.7167,
"num_input_tokens_seen": 6777995264,
"step": 808
},
{
"epoch": 0.4045,
"grad_norm": 0.2884032428264618,
"learning_rate": 5.366975801761507e-06,
"loss": 1.4414,
"num_input_tokens_seen": 6786383872,
"step": 809
},
{
"epoch": 0.405,
"grad_norm": 0.3318146765232086,
"learning_rate": 5.313549811286294e-06,
"loss": 1.712,
"num_input_tokens_seen": 6794772480,
"step": 810
},
{
"epoch": 0.4055,
"grad_norm": 0.3560062646865845,
"learning_rate": 5.260350304040987e-06,
"loss": 1.5902,
"num_input_tokens_seen": 6803161088,
"step": 811
},
{
"epoch": 0.406,
"grad_norm": 0.31840780377388,
"learning_rate": 5.207378100427804e-06,
"loss": 1.5372,
"num_input_tokens_seen": 6811549696,
"step": 812
},
{
"epoch": 0.4065,
"grad_norm": 0.3286549150943756,
"learning_rate": 5.154634017343662e-06,
"loss": 1.6759,
"num_input_tokens_seen": 6819938304,
"step": 813
},
{
"epoch": 0.407,
"grad_norm": 0.3250563144683838,
"learning_rate": 5.102118868167565e-06,
"loss": 1.5406,
"num_input_tokens_seen": 6828326912,
"step": 814
},
{
"epoch": 0.4075,
"grad_norm": 0.3316640555858612,
"learning_rate": 5.049833462748061e-06,
"loss": 1.7991,
"num_input_tokens_seen": 6836715520,
"step": 815
},
{
"epoch": 0.408,
"grad_norm": 0.3580614924430847,
"learning_rate": 4.997778607390809e-06,
"loss": 1.6432,
"num_input_tokens_seen": 6845104128,
"step": 816
},
{
"epoch": 0.4085,
"grad_norm": 0.3245338499546051,
"learning_rate": 4.945955104846061e-06,
"loss": 1.6115,
"num_input_tokens_seen": 6853492736,
"step": 817
},
{
"epoch": 0.409,
"grad_norm": 0.3292555510997772,
"learning_rate": 4.89436375429633e-06,
"loss": 1.3836,
"num_input_tokens_seen": 6861881344,
"step": 818
},
{
"epoch": 0.4095,
"grad_norm": 0.33106541633605957,
"learning_rate": 4.843005351344065e-06,
"loss": 1.4818,
"num_input_tokens_seen": 6870269952,
"step": 819
},
{
"epoch": 0.41,
"grad_norm": 0.4454377293586731,
"learning_rate": 4.791880687999382e-06,
"loss": 1.4974,
"num_input_tokens_seen": 6878658560,
"step": 820
},
{
"epoch": 0.4105,
"grad_norm": 0.3861941993236542,
"learning_rate": 4.740990552667823e-06,
"loss": 1.5086,
"num_input_tokens_seen": 6887047168,
"step": 821
},
{
"epoch": 0.411,
"grad_norm": 0.3597433865070343,
"learning_rate": 4.6903357301382405e-06,
"loss": 1.515,
"num_input_tokens_seen": 6895435776,
"step": 822
},
{
"epoch": 0.4115,
"grad_norm": 0.35469549894332886,
"learning_rate": 4.639917001570644e-06,
"loss": 1.635,
"num_input_tokens_seen": 6903824384,
"step": 823
},
{
"epoch": 0.412,
"grad_norm": 0.33241182565689087,
"learning_rate": 4.589735144484217e-06,
"loss": 1.6323,
"num_input_tokens_seen": 6912212992,
"step": 824
},
{
"epoch": 0.4125,
"grad_norm": 0.37007105350494385,
"learning_rate": 4.53979093274526e-06,
"loss": 1.6834,
"num_input_tokens_seen": 6920601600,
"step": 825
},
{
"epoch": 0.413,
"grad_norm": 0.3709860146045685,
"learning_rate": 4.490085136555313e-06,
"loss": 1.4491,
"num_input_tokens_seen": 6928990208,
"step": 826
},
{
"epoch": 0.4135,
"grad_norm": 0.3294796645641327,
"learning_rate": 4.440618522439237e-06,
"loss": 1.4501,
"num_input_tokens_seen": 6937378816,
"step": 827
},
{
"epoch": 0.414,
"grad_norm": 0.3185144066810608,
"learning_rate": 4.391391853233404e-06,
"loss": 1.4515,
"num_input_tokens_seen": 6945767424,
"step": 828
},
{
"epoch": 0.4145,
"grad_norm": 0.3276340961456299,
"learning_rate": 4.342405888073971e-06,
"loss": 1.6034,
"num_input_tokens_seen": 6954156032,
"step": 829
},
{
"epoch": 0.415,
"grad_norm": 0.3218885362148285,
"learning_rate": 4.293661382385106e-06,
"loss": 1.4493,
"num_input_tokens_seen": 6962544640,
"step": 830
},
{
"epoch": 0.4155,
"grad_norm": 0.318389356136322,
"learning_rate": 4.245159087867383e-06,
"loss": 1.7035,
"num_input_tokens_seen": 6970933248,
"step": 831
},
{
"epoch": 0.416,
"grad_norm": 0.36420193314552307,
"learning_rate": 4.196899752486192e-06,
"loss": 1.4633,
"num_input_tokens_seen": 6979321856,
"step": 832
},
{
"epoch": 0.4165,
"grad_norm": 0.3152608275413513,
"learning_rate": 4.148884120460186e-06,
"loss": 1.4906,
"num_input_tokens_seen": 6987710464,
"step": 833
},
{
"epoch": 0.417,
"grad_norm": 0.30179423093795776,
"learning_rate": 4.1011129322498e-06,
"loss": 1.6253,
"num_input_tokens_seen": 6996099072,
"step": 834
},
{
"epoch": 0.4175,
"grad_norm": 0.28013914823532104,
"learning_rate": 4.05358692454586e-06,
"loss": 1.5175,
"num_input_tokens_seen": 7004487680,
"step": 835
},
{
"epoch": 0.418,
"grad_norm": 0.344892293214798,
"learning_rate": 4.006306830258189e-06,
"loss": 1.6806,
"num_input_tokens_seen": 7012876288,
"step": 836
},
{
"epoch": 0.4185,
"grad_norm": 0.31076744198799133,
"learning_rate": 3.9592733785043405e-06,
"loss": 1.4867,
"num_input_tokens_seen": 7021264896,
"step": 837
},
{
"epoch": 0.419,
"grad_norm": 0.30278122425079346,
"learning_rate": 3.91248729459831e-06,
"loss": 1.6043,
"num_input_tokens_seen": 7029653504,
"step": 838
},
{
"epoch": 0.4195,
"grad_norm": 1.5924491882324219,
"learning_rate": 3.865949300039404e-06,
"loss": 1.6819,
"num_input_tokens_seen": 7038042112,
"step": 839
},
{
"epoch": 0.42,
"grad_norm": 0.3173651695251465,
"learning_rate": 3.819660112501053e-06,
"loss": 1.5288,
"num_input_tokens_seen": 7046430720,
"step": 840
},
{
"epoch": 0.4205,
"grad_norm": 0.390505850315094,
"learning_rate": 3.773620445819799e-06,
"loss": 1.7401,
"num_input_tokens_seen": 7054819328,
"step": 841
},
{
"epoch": 0.421,
"grad_norm": 0.3141814172267914,
"learning_rate": 3.727831009984262e-06,
"loss": 1.5442,
"num_input_tokens_seen": 7063207936,
"step": 842
},
{
"epoch": 0.4215,
"grad_norm": 0.32728394865989685,
"learning_rate": 3.682292511124179e-06,
"loss": 1.524,
"num_input_tokens_seen": 7071596544,
"step": 843
},
{
"epoch": 0.422,
"grad_norm": 0.30296072363853455,
"learning_rate": 3.637005651499528e-06,
"loss": 1.5514,
"num_input_tokens_seen": 7079985152,
"step": 844
},
{
"epoch": 0.4225,
"grad_norm": 0.3034377098083496,
"learning_rate": 3.5919711294897285e-06,
"loss": 1.6488,
"num_input_tokens_seen": 7088373760,
"step": 845
},
{
"epoch": 0.423,
"grad_norm": 0.3028651773929596,
"learning_rate": 3.5471896395828064e-06,
"loss": 1.6122,
"num_input_tokens_seen": 7096762368,
"step": 846
},
{
"epoch": 0.4235,
"grad_norm": 0.28566136956214905,
"learning_rate": 3.502661872364732e-06,
"loss": 1.6506,
"num_input_tokens_seen": 7105150976,
"step": 847
},
{
"epoch": 0.424,
"grad_norm": 0.32108139991760254,
"learning_rate": 3.4583885145087613e-06,
"loss": 1.4408,
"num_input_tokens_seen": 7113539584,
"step": 848
},
{
"epoch": 0.4245,
"grad_norm": 0.33206671476364136,
"learning_rate": 3.414370248764849e-06,
"loss": 1.4943,
"num_input_tokens_seen": 7121928192,
"step": 849
},
{
"epoch": 0.425,
"grad_norm": 0.27473020553588867,
"learning_rate": 3.3706077539490933e-06,
"loss": 1.5191,
"num_input_tokens_seen": 7130316800,
"step": 850
},
{
"epoch": 0.4255,
"grad_norm": 0.291063517332077,
"learning_rate": 3.327101704933313e-06,
"loss": 1.341,
"num_input_tokens_seen": 7138705408,
"step": 851
},
{
"epoch": 0.426,
"grad_norm": 0.35267290472984314,
"learning_rate": 3.2838527726345994e-06,
"loss": 1.3756,
"num_input_tokens_seen": 7147094016,
"step": 852
},
{
"epoch": 0.4265,
"grad_norm": 0.29957467317581177,
"learning_rate": 3.240861624004983e-06,
"loss": 1.7059,
"num_input_tokens_seen": 7155482624,
"step": 853
},
{
"epoch": 0.427,
"grad_norm": 0.29212555289268494,
"learning_rate": 3.198128922021162e-06,
"loss": 1.4891,
"num_input_tokens_seen": 7163871232,
"step": 854
},
{
"epoch": 0.4275,
"grad_norm": 0.3003202974796295,
"learning_rate": 3.155655325674272e-06,
"loss": 1.5788,
"num_input_tokens_seen": 7172259840,
"step": 855
},
{
"epoch": 0.428,
"grad_norm": 0.29497838020324707,
"learning_rate": 3.1134414899597033e-06,
"loss": 1.6972,
"num_input_tokens_seen": 7180648448,
"step": 856
},
{
"epoch": 0.4285,
"grad_norm": 0.28923463821411133,
"learning_rate": 3.0714880658670165e-06,
"loss": 1.4985,
"num_input_tokens_seen": 7189037056,
"step": 857
},
{
"epoch": 0.429,
"grad_norm": 0.31097179651260376,
"learning_rate": 3.0297957003699284e-06,
"loss": 1.5965,
"num_input_tokens_seen": 7197425664,
"step": 858
},
{
"epoch": 0.4295,
"grad_norm": 0.2652358114719391,
"learning_rate": 2.9883650364162784e-06,
"loss": 1.4394,
"num_input_tokens_seen": 7205814272,
"step": 859
},
{
"epoch": 0.43,
"grad_norm": 0.29938805103302,
"learning_rate": 2.947196712918157e-06,
"loss": 1.6263,
"num_input_tokens_seen": 7214202880,
"step": 860
},
{
"epoch": 0.4305,
"grad_norm": 0.2985369861125946,
"learning_rate": 2.906291364742042e-06,
"loss": 1.5659,
"num_input_tokens_seen": 7222591488,
"step": 861
},
{
"epoch": 0.431,
"grad_norm": 0.31858527660369873,
"learning_rate": 2.8656496226990092e-06,
"loss": 1.6757,
"num_input_tokens_seen": 7230980096,
"step": 862
},
{
"epoch": 0.4315,
"grad_norm": 0.3064689636230469,
"learning_rate": 2.8252721135349892e-06,
"loss": 1.495,
"num_input_tokens_seen": 7239368704,
"step": 863
},
{
"epoch": 0.432,
"grad_norm": 0.2998564839363098,
"learning_rate": 2.7851594599211297e-06,
"loss": 1.3919,
"num_input_tokens_seen": 7247757312,
"step": 864
},
{
"epoch": 0.4325,
"grad_norm": 0.30185946822166443,
"learning_rate": 2.7453122804441636e-06,
"loss": 1.3706,
"num_input_tokens_seen": 7256145920,
"step": 865
},
{
"epoch": 0.433,
"grad_norm": 0.313856303691864,
"learning_rate": 2.705731189596901e-06,
"loss": 1.6292,
"num_input_tokens_seen": 7264534528,
"step": 866
},
{
"epoch": 0.4335,
"grad_norm": 0.32189980149269104,
"learning_rate": 2.6664167977687182e-06,
"loss": 1.7559,
"num_input_tokens_seen": 7272923136,
"step": 867
},
{
"epoch": 0.434,
"grad_norm": 0.3159331977367401,
"learning_rate": 2.6273697112361786e-06,
"loss": 1.6213,
"num_input_tokens_seen": 7281311744,
"step": 868
},
{
"epoch": 0.4345,
"grad_norm": 0.30264052748680115,
"learning_rate": 2.588590532153652e-06,
"loss": 1.5662,
"num_input_tokens_seen": 7289700352,
"step": 869
},
{
"epoch": 0.435,
"grad_norm": 0.29510387778282166,
"learning_rate": 2.550079858544057e-06,
"loss": 1.5406,
"num_input_tokens_seen": 7298088960,
"step": 870
},
{
"epoch": 0.4355,
"grad_norm": 0.3535095155239105,
"learning_rate": 2.511838284289625e-06,
"loss": 1.5925,
"num_input_tokens_seen": 7306477568,
"step": 871
},
{
"epoch": 0.436,
"grad_norm": 0.3253929615020752,
"learning_rate": 2.473866399122733e-06,
"loss": 1.5054,
"num_input_tokens_seen": 7314866176,
"step": 872
},
{
"epoch": 0.4365,
"grad_norm": 0.27736151218414307,
"learning_rate": 2.436164788616815e-06,
"loss": 1.6797,
"num_input_tokens_seen": 7323254784,
"step": 873
},
{
"epoch": 0.437,
"grad_norm": 0.2774750590324402,
"learning_rate": 2.398734034177361e-06,
"loss": 1.3784,
"num_input_tokens_seen": 7331643392,
"step": 874
},
{
"epoch": 0.4375,
"grad_norm": 1.0643863677978516,
"learning_rate": 2.3615747130329013e-06,
"loss": 1.3942,
"num_input_tokens_seen": 7340032000,
"step": 875
},
{
"epoch": 0.438,
"grad_norm": 0.2871946096420288,
"learning_rate": 2.324687398226131e-06,
"loss": 1.5828,
"num_input_tokens_seen": 7348420608,
"step": 876
},
{
"epoch": 0.4385,
"grad_norm": 0.28776809573173523,
"learning_rate": 2.288072658605087e-06,
"loss": 1.4781,
"num_input_tokens_seen": 7356809216,
"step": 877
},
{
"epoch": 0.439,
"grad_norm": 0.31147900223731995,
"learning_rate": 2.2517310588143372e-06,
"loss": 1.6066,
"num_input_tokens_seen": 7365197824,
"step": 878
},
{
"epoch": 0.4395,
"grad_norm": 0.32211750745773315,
"learning_rate": 2.215663159286314e-06,
"loss": 1.6086,
"num_input_tokens_seen": 7373586432,
"step": 879
},
{
"epoch": 0.44,
"grad_norm": 0.360286682844162,
"learning_rate": 2.1798695162326444e-06,
"loss": 1.5362,
"num_input_tokens_seen": 7381975040,
"step": 880
},
{
"epoch": 0.4405,
"grad_norm": 0.2793222665786743,
"learning_rate": 2.144350681635585e-06,
"loss": 1.6441,
"num_input_tokens_seen": 7390363648,
"step": 881
},
{
"epoch": 0.441,
"grad_norm": 0.30778202414512634,
"learning_rate": 2.1091072032395e-06,
"loss": 1.6035,
"num_input_tokens_seen": 7398752256,
"step": 882
},
{
"epoch": 0.4415,
"grad_norm": 0.35812097787857056,
"learning_rate": 2.0741396245424263e-06,
"loss": 1.5975,
"num_input_tokens_seen": 7407140864,
"step": 883
},
{
"epoch": 0.442,
"grad_norm": 0.3686063289642334,
"learning_rate": 2.0394484847876894e-06,
"loss": 1.6201,
"num_input_tokens_seen": 7415529472,
"step": 884
},
{
"epoch": 0.4425,
"grad_norm": 0.3779139220714569,
"learning_rate": 2.0050343189555743e-06,
"loss": 1.6497,
"num_input_tokens_seen": 7423918080,
"step": 885
},
{
"epoch": 0.443,
"grad_norm": 0.298098623752594,
"learning_rate": 1.970897657755084e-06,
"loss": 1.4754,
"num_input_tokens_seen": 7432306688,
"step": 886
},
{
"epoch": 0.4435,
"grad_norm": 0.3022516071796417,
"learning_rate": 1.937039027615779e-06,
"loss": 1.4341,
"num_input_tokens_seen": 7440695296,
"step": 887
},
{
"epoch": 0.444,
"grad_norm": 0.30083125829696655,
"learning_rate": 1.903458950679613e-06,
"loss": 1.6386,
"num_input_tokens_seen": 7449083904,
"step": 888
},
{
"epoch": 0.4445,
"grad_norm": 0.2998676598072052,
"learning_rate": 1.8701579447929076e-06,
"loss": 1.4833,
"num_input_tokens_seen": 7457472512,
"step": 889
},
{
"epoch": 0.445,
"grad_norm": 0.28990113735198975,
"learning_rate": 1.837136523498373e-06,
"loss": 1.5555,
"num_input_tokens_seen": 7465861120,
"step": 890
},
{
"epoch": 0.4455,
"grad_norm": 0.2804090082645416,
"learning_rate": 1.80439519602718e-06,
"loss": 1.5589,
"num_input_tokens_seen": 7474249728,
"step": 891
},
{
"epoch": 0.446,
"grad_norm": 0.29142701625823975,
"learning_rate": 1.7719344672910942e-06,
"loss": 1.5012,
"num_input_tokens_seen": 7482638336,
"step": 892
},
{
"epoch": 0.4465,
"grad_norm": 0.2881058156490326,
"learning_rate": 1.7397548378747142e-06,
"loss": 1.6529,
"num_input_tokens_seen": 7491026944,
"step": 893
},
{
"epoch": 0.447,
"grad_norm": 0.29974132776260376,
"learning_rate": 1.7078568040277276e-06,
"loss": 1.4558,
"num_input_tokens_seen": 7499415552,
"step": 894
},
{
"epoch": 0.4475,
"grad_norm": 0.24919338524341583,
"learning_rate": 1.676240857657283e-06,
"loss": 1.6168,
"num_input_tokens_seen": 7507804160,
"step": 895
},
{
"epoch": 0.448,
"grad_norm": 0.27764326333999634,
"learning_rate": 1.6449074863203773e-06,
"loss": 1.4641,
"num_input_tokens_seen": 7516192768,
"step": 896
},
{
"epoch": 0.4485,
"grad_norm": 0.28568482398986816,
"learning_rate": 1.6138571732163643e-06,
"loss": 1.5211,
"num_input_tokens_seen": 7524581376,
"step": 897
},
{
"epoch": 0.449,
"grad_norm": 0.7067427039146423,
"learning_rate": 1.5830903971794765e-06,
"loss": 1.6592,
"num_input_tokens_seen": 7532969984,
"step": 898
},
{
"epoch": 0.4495,
"grad_norm": 0.2689734399318695,
"learning_rate": 1.5526076326714635e-06,
"loss": 1.4476,
"num_input_tokens_seen": 7541358592,
"step": 899
},
{
"epoch": 0.45,
"grad_norm": 0.27099910378456116,
"learning_rate": 1.5224093497742654e-06,
"loss": 1.4756,
"num_input_tokens_seen": 7549747200,
"step": 900
},
{
"epoch": 0.4505,
"grad_norm": 0.2850476801395416,
"learning_rate": 1.4924960141827605e-06,
"loss": 1.5165,
"num_input_tokens_seen": 7558135808,
"step": 901
},
{
"epoch": 0.451,
"grad_norm": 0.268863707780838,
"learning_rate": 1.4628680871975842e-06,
"loss": 1.5588,
"num_input_tokens_seen": 7566524416,
"step": 902
},
{
"epoch": 0.4515,
"grad_norm": 0.26697346568107605,
"learning_rate": 1.4335260257180262e-06,
"loss": 1.596,
"num_input_tokens_seen": 7574913024,
"step": 903
},
{
"epoch": 0.452,
"grad_norm": 0.28044283390045166,
"learning_rate": 1.4044702822349731e-06,
"loss": 1.4721,
"num_input_tokens_seen": 7583301632,
"step": 904
},
{
"epoch": 0.4525,
"grad_norm": 0.27265068888664246,
"learning_rate": 1.3757013048239287e-06,
"loss": 1.7483,
"num_input_tokens_seen": 7591690240,
"step": 905
},
{
"epoch": 0.453,
"grad_norm": 0.27618253231048584,
"learning_rate": 1.3472195371381202e-06,
"loss": 1.4239,
"num_input_tokens_seen": 7600078848,
"step": 906
},
{
"epoch": 0.4535,
"grad_norm": 0.27610379457473755,
"learning_rate": 1.3190254184016294e-06,
"loss": 1.4987,
"num_input_tokens_seen": 7608467456,
"step": 907
},
{
"epoch": 0.454,
"grad_norm": 0.2572946548461914,
"learning_rate": 1.2911193834026548e-06,
"loss": 1.4957,
"num_input_tokens_seen": 7616856064,
"step": 908
},
{
"epoch": 0.4545,
"grad_norm": 0.2584414482116699,
"learning_rate": 1.2635018624867712e-06,
"loss": 1.6271,
"num_input_tokens_seen": 7625244672,
"step": 909
},
{
"epoch": 0.455,
"grad_norm": 0.26803648471832275,
"learning_rate": 1.236173281550319e-06,
"loss": 1.6387,
"num_input_tokens_seen": 7633633280,
"step": 910
},
{
"epoch": 0.4555,
"grad_norm": 0.25464460253715515,
"learning_rate": 1.209134062033821e-06,
"loss": 1.5111,
"num_input_tokens_seen": 7642021888,
"step": 911
},
{
"epoch": 0.456,
"grad_norm": 0.2775379717350006,
"learning_rate": 1.182384620915491e-06,
"loss": 1.7476,
"num_input_tokens_seen": 7650410496,
"step": 912
},
{
"epoch": 0.4565,
"grad_norm": 0.27452683448791504,
"learning_rate": 1.1559253707048046e-06,
"loss": 1.5443,
"num_input_tokens_seen": 7658799104,
"step": 913
},
{
"epoch": 0.457,
"grad_norm": 0.2475164234638214,
"learning_rate": 1.1297567194361303e-06,
"loss": 1.6505,
"num_input_tokens_seen": 7667187712,
"step": 914
},
{
"epoch": 0.4575,
"grad_norm": 0.28584203124046326,
"learning_rate": 1.103879070662439e-06,
"loss": 1.5918,
"num_input_tokens_seen": 7675576320,
"step": 915
},
{
"epoch": 0.458,
"grad_norm": 0.2768670618534088,
"learning_rate": 1.0782928234490941e-06,
"loss": 1.482,
"num_input_tokens_seen": 7683964928,
"step": 916
},
{
"epoch": 0.4585,
"grad_norm": 0.27717721462249756,
"learning_rate": 1.0529983723676751e-06,
"loss": 1.6142,
"num_input_tokens_seen": 7692353536,
"step": 917
},
{
"epoch": 0.459,
"grad_norm": 0.2514759302139282,
"learning_rate": 1.027996107489908e-06,
"loss": 1.5034,
"num_input_tokens_seen": 7700742144,
"step": 918
},
{
"epoch": 0.4595,
"grad_norm": 0.2566506862640381,
"learning_rate": 1.0032864143816456e-06,
"loss": 1.5485,
"num_input_tokens_seen": 7709130752,
"step": 919
},
{
"epoch": 0.46,
"grad_norm": 0.275288462638855,
"learning_rate": 9.788696740969295e-07,
"loss": 1.5363,
"num_input_tokens_seen": 7717519360,
"step": 920
},
{
"epoch": 0.4605,
"grad_norm": 0.26422953605651855,
"learning_rate": 9.547462631720906e-07,
"loss": 1.7154,
"num_input_tokens_seen": 7725907968,
"step": 921
},
{
"epoch": 0.461,
"grad_norm": 0.2841811180114746,
"learning_rate": 9.30916553619976e-07,
"loss": 1.6462,
"num_input_tokens_seen": 7734296576,
"step": 922
},
{
"epoch": 0.4615,
"grad_norm": 0.2585889995098114,
"learning_rate": 9.073809129241784e-07,
"loss": 1.5431,
"num_input_tokens_seen": 7742685184,
"step": 923
},
{
"epoch": 0.462,
"grad_norm": 0.25092291831970215,
"learning_rate": 8.841397040333976e-07,
"loss": 1.5584,
"num_input_tokens_seen": 7751073792,
"step": 924
},
{
"epoch": 0.4625,
"grad_norm": 0.2754204273223877,
"learning_rate": 8.611932853558236e-07,
"loss": 1.5088,
"num_input_tokens_seen": 7759462400,
"step": 925
},
{
"epoch": 0.463,
"grad_norm": 0.29892218112945557,
"learning_rate": 8.38542010753618e-07,
"loss": 1.5045,
"num_input_tokens_seen": 7767851008,
"step": 926
},
{
"epoch": 0.4635,
"grad_norm": 0.2517067492008209,
"learning_rate": 8.161862295374567e-07,
"loss": 1.4251,
"num_input_tokens_seen": 7776239616,
"step": 927
},
{
"epoch": 0.464,
"grad_norm": 0.28415587544441223,
"learning_rate": 7.941262864611387e-07,
"loss": 1.5208,
"num_input_tokens_seen": 7784628224,
"step": 928
},
{
"epoch": 0.4645,
"grad_norm": 0.2550823986530304,
"learning_rate": 7.723625217162811e-07,
"loss": 1.5787,
"num_input_tokens_seen": 7793016832,
"step": 929
},
{
"epoch": 0.465,
"grad_norm": 0.24390766024589539,
"learning_rate": 7.508952709270567e-07,
"loss": 1.6071,
"num_input_tokens_seen": 7801405440,
"step": 930
},
{
"epoch": 0.4655,
"grad_norm": 0.2771441638469696,
"learning_rate": 7.29724865145025e-07,
"loss": 1.6024,
"num_input_tokens_seen": 7809794048,
"step": 931
},
{
"epoch": 0.466,
"grad_norm": 0.3028735816478729,
"learning_rate": 7.088516308440386e-07,
"loss": 1.6315,
"num_input_tokens_seen": 7818182656,
"step": 932
},
{
"epoch": 0.4665,
"grad_norm": 0.273967981338501,
"learning_rate": 6.882758899151886e-07,
"loss": 1.5286,
"num_input_tokens_seen": 7826571264,
"step": 933
},
{
"epoch": 0.467,
"grad_norm": 0.2625696361064911,
"learning_rate": 6.679979596618546e-07,
"loss": 1.7165,
"num_input_tokens_seen": 7834959872,
"step": 934
},
{
"epoch": 0.4675,
"grad_norm": 0.7894181609153748,
"learning_rate": 6.480181527948049e-07,
"loss": 1.4989,
"num_input_tokens_seen": 7843348480,
"step": 935
},
{
"epoch": 0.468,
"grad_norm": 0.27667322754859924,
"learning_rate": 6.283367774273785e-07,
"loss": 1.6919,
"num_input_tokens_seen": 7851737088,
"step": 936
},
{
"epoch": 0.4685,
"grad_norm": 0.2543598711490631,
"learning_rate": 6.089541370707297e-07,
"loss": 1.6136,
"num_input_tokens_seen": 7860125696,
"step": 937
},
{
"epoch": 0.469,
"grad_norm": 0.2823677361011505,
"learning_rate": 5.898705306291508e-07,
"loss": 1.7088,
"num_input_tokens_seen": 7868514304,
"step": 938
},
{
"epoch": 0.4695,
"grad_norm": 0.2542930543422699,
"learning_rate": 5.71086252395463e-07,
"loss": 1.728,
"num_input_tokens_seen": 7876902912,
"step": 939
},
{
"epoch": 0.47,
"grad_norm": 0.2800678014755249,
"learning_rate": 5.526015920464689e-07,
"loss": 1.5442,
"num_input_tokens_seen": 7885291520,
"step": 940
},
{
"epoch": 0.4705,
"grad_norm": 0.28233593702316284,
"learning_rate": 5.344168346385003e-07,
"loss": 1.5762,
"num_input_tokens_seen": 7893680128,
"step": 941
},
{
"epoch": 0.471,
"grad_norm": 0.2772792875766754,
"learning_rate": 5.165322606030132e-07,
"loss": 1.498,
"num_input_tokens_seen": 7902068736,
"step": 942
},
{
"epoch": 0.4715,
"grad_norm": 0.258087694644928,
"learning_rate": 4.98948145742264e-07,
"loss": 1.6592,
"num_input_tokens_seen": 7910457344,
"step": 943
},
{
"epoch": 0.472,
"grad_norm": 0.2860059142112732,
"learning_rate": 4.816647612250513e-07,
"loss": 1.5144,
"num_input_tokens_seen": 7918845952,
"step": 944
},
{
"epoch": 0.4725,
"grad_norm": 0.24073940515518188,
"learning_rate": 4.646823735825523e-07,
"loss": 1.6956,
"num_input_tokens_seen": 7927234560,
"step": 945
},
{
"epoch": 0.473,
"grad_norm": 0.25412750244140625,
"learning_rate": 4.4800124470418815e-07,
"loss": 1.5523,
"num_input_tokens_seen": 7935623168,
"step": 946
},
{
"epoch": 0.4735,
"grad_norm": 0.2561289966106415,
"learning_rate": 4.3162163183360084e-07,
"loss": 1.5933,
"num_input_tokens_seen": 7944011776,
"step": 947
},
{
"epoch": 0.474,
"grad_norm": 0.2627177834510803,
"learning_rate": 4.155437875646828e-07,
"loss": 1.5529,
"num_input_tokens_seen": 7952400384,
"step": 948
},
{
"epoch": 0.4745,
"grad_norm": 0.2649383544921875,
"learning_rate": 3.997679598376891e-07,
"loss": 1.5151,
"num_input_tokens_seen": 7960788992,
"step": 949
},
{
"epoch": 0.475,
"grad_norm": 0.25103694200515747,
"learning_rate": 3.842943919353914e-07,
"loss": 1.3731,
"num_input_tokens_seen": 7969177600,
"step": 950
},
{
"epoch": 0.4755,
"grad_norm": 0.24840368330478668,
"learning_rate": 3.6912332247935224e-07,
"loss": 1.554,
"num_input_tokens_seen": 7977566208,
"step": 951
},
{
"epoch": 0.476,
"grad_norm": 0.2701607048511505,
"learning_rate": 3.5425498542622784e-07,
"loss": 1.4967,
"num_input_tokens_seen": 7985954816,
"step": 952
},
{
"epoch": 0.4765,
"grad_norm": 0.2613852620124817,
"learning_rate": 3.396896100641689e-07,
"loss": 1.5878,
"num_input_tokens_seen": 7994343424,
"step": 953
},
{
"epoch": 0.477,
"grad_norm": 0.26147374510765076,
"learning_rate": 3.2542742100928114e-07,
"loss": 1.5783,
"num_input_tokens_seen": 8002732032,
"step": 954
},
{
"epoch": 0.4775,
"grad_norm": 0.3100808560848236,
"learning_rate": 3.114686382021681e-07,
"loss": 1.5472,
"num_input_tokens_seen": 8011120640,
"step": 955
},
{
"epoch": 0.478,
"grad_norm": 0.26423409581184387,
"learning_rate": 2.9781347690452266e-07,
"loss": 1.8478,
"num_input_tokens_seen": 8019509248,
"step": 956
},
{
"epoch": 0.4785,
"grad_norm": 0.2365507185459137,
"learning_rate": 2.8446214769582534e-07,
"loss": 1.6472,
"num_input_tokens_seen": 8027897856,
"step": 957
},
{
"epoch": 0.479,
"grad_norm": 0.2785324454307556,
"learning_rate": 2.714148564700914e-07,
"loss": 1.6811,
"num_input_tokens_seen": 8036286464,
"step": 958
},
{
"epoch": 0.4795,
"grad_norm": 0.23417918384075165,
"learning_rate": 2.586718044326886e-07,
"loss": 1.4201,
"num_input_tokens_seen": 8044675072,
"step": 959
},
{
"epoch": 0.48,
"grad_norm": 0.2550223469734192,
"learning_rate": 2.462331880972468e-07,
"loss": 1.4727,
"num_input_tokens_seen": 8053063680,
"step": 960
},
{
"epoch": 0.4805,
"grad_norm": 0.25741317868232727,
"learning_rate": 2.340991992826136e-07,
"loss": 1.5361,
"num_input_tokens_seen": 8061452288,
"step": 961
},
{
"epoch": 0.481,
"grad_norm": 0.2440570592880249,
"learning_rate": 2.222700251099097e-07,
"loss": 1.6914,
"num_input_tokens_seen": 8069840896,
"step": 962
},
{
"epoch": 0.4815,
"grad_norm": 0.4941231906414032,
"learning_rate": 2.107458479996316e-07,
"loss": 1.7785,
"num_input_tokens_seen": 8078229504,
"step": 963
},
{
"epoch": 0.482,
"grad_norm": 0.2466343194246292,
"learning_rate": 1.9952684566884927e-07,
"loss": 1.4094,
"num_input_tokens_seen": 8086618112,
"step": 964
},
{
"epoch": 0.4825,
"grad_norm": 0.2508993148803711,
"learning_rate": 1.88613191128455e-07,
"loss": 1.5626,
"num_input_tokens_seen": 8095006720,
"step": 965
},
{
"epoch": 0.483,
"grad_norm": 0.27574288845062256,
"learning_rate": 1.780050526805055e-07,
"loss": 1.6807,
"num_input_tokens_seen": 8103395328,
"step": 966
},
{
"epoch": 0.4835,
"grad_norm": 0.26518452167510986,
"learning_rate": 1.6770259391561518e-07,
"loss": 1.6383,
"num_input_tokens_seen": 8111783936,
"step": 967
},
{
"epoch": 0.484,
"grad_norm": 0.2572641968727112,
"learning_rate": 1.577059737104447e-07,
"loss": 1.6402,
"num_input_tokens_seen": 8120172544,
"step": 968
},
{
"epoch": 0.4845,
"grad_norm": 0.27201879024505615,
"learning_rate": 1.4801534622524316e-07,
"loss": 1.5114,
"num_input_tokens_seen": 8128561152,
"step": 969
},
{
"epoch": 0.485,
"grad_norm": 0.23598560690879822,
"learning_rate": 1.3863086090147415e-07,
"loss": 1.5498,
"num_input_tokens_seen": 8136949760,
"step": 970
},
{
"epoch": 0.4855,
"grad_norm": 0.2551763653755188,
"learning_rate": 1.2955266245951338e-07,
"loss": 1.5513,
"num_input_tokens_seen": 8145338368,
"step": 971
},
{
"epoch": 0.486,
"grad_norm": 0.2625666558742523,
"learning_rate": 1.2078089089640809e-07,
"loss": 1.6809,
"num_input_tokens_seen": 8153726976,
"step": 972
},
{
"epoch": 0.4865,
"grad_norm": 0.26251545548439026,
"learning_rate": 1.1231568148372562e-07,
"loss": 1.5135,
"num_input_tokens_seen": 8162115584,
"step": 973
},
{
"epoch": 0.487,
"grad_norm": 0.2552337050437927,
"learning_rate": 1.0415716476547045e-07,
"loss": 1.6377,
"num_input_tokens_seen": 8170504192,
"step": 974
},
{
"epoch": 0.4875,
"grad_norm": 0.25301122665405273,
"learning_rate": 9.630546655606365e-08,
"loss": 1.6496,
"num_input_tokens_seen": 8178892800,
"step": 975
},
{
"epoch": 0.488,
"grad_norm": 0.23959580063819885,
"learning_rate": 8.876070793840008e-08,
"loss": 1.4887,
"num_input_tokens_seen": 8187281408,
"step": 976
},
{
"epoch": 0.4885,
"grad_norm": 0.2698057293891907,
"learning_rate": 8.15230052619942e-08,
"loss": 1.5558,
"num_input_tokens_seen": 8195670016,
"step": 977
},
{
"epoch": 0.489,
"grad_norm": 0.2517794370651245,
"learning_rate": 7.459247014117488e-08,
"loss": 1.6568,
"num_input_tokens_seen": 8204058624,
"step": 978
},
{
"epoch": 0.4895,
"grad_norm": 0.32929736375808716,
"learning_rate": 6.796920945336682e-08,
"loss": 1.6056,
"num_input_tokens_seen": 8212447232,
"step": 979
},
{
"epoch": 0.49,
"grad_norm": 0.2451072484254837,
"learning_rate": 6.165332533744072e-08,
"loss": 1.3545,
"num_input_tokens_seen": 8220835840,
"step": 980
},
{
"epoch": 0.4905,
"grad_norm": 0.24773673713207245,
"learning_rate": 5.5644915192145654e-08,
"loss": 1.6526,
"num_input_tokens_seen": 8229224448,
"step": 981
},
{
"epoch": 0.491,
"grad_norm": 0.2363719791173935,
"learning_rate": 4.9944071674599135e-08,
"loss": 1.5398,
"num_input_tokens_seen": 8237613056,
"step": 982
},
{
"epoch": 0.4915,
"grad_norm": 0.24789544939994812,
"learning_rate": 4.4550882698857214e-08,
"loss": 1.5177,
"num_input_tokens_seen": 8246001664,
"step": 983
},
{
"epoch": 0.492,
"grad_norm": 0.24309134483337402,
"learning_rate": 3.946543143456882e-08,
"loss": 1.5559,
"num_input_tokens_seen": 8254390272,
"step": 984
},
{
"epoch": 0.4925,
"grad_norm": 0.2546921968460083,
"learning_rate": 3.468779630568353e-08,
"loss": 1.5036,
"num_input_tokens_seen": 8262778880,
"step": 985
},
{
"epoch": 0.493,
"grad_norm": 0.2471814751625061,
"learning_rate": 3.021805098924136e-08,
"loss": 1.5532,
"num_input_tokens_seen": 8271167488,
"step": 986
},
{
"epoch": 0.4935,
"grad_norm": 0.24231931567192078,
"learning_rate": 2.6056264414249245e-08,
"loss": 1.5185,
"num_input_tokens_seen": 8279556096,
"step": 987
},
{
"epoch": 0.494,
"grad_norm": 0.24174270033836365,
"learning_rate": 2.220250076060193e-08,
"loss": 1.5999,
"num_input_tokens_seen": 8287944704,
"step": 988
},
{
"epoch": 0.4945,
"grad_norm": 0.24908506870269775,
"learning_rate": 1.8656819458100496e-08,
"loss": 1.5319,
"num_input_tokens_seen": 8296333312,
"step": 989
},
{
"epoch": 0.495,
"grad_norm": 0.25362494587898254,
"learning_rate": 1.541927518554198e-08,
"loss": 1.6403,
"num_input_tokens_seen": 8304721920,
"step": 990
},
{
"epoch": 0.4955,
"grad_norm": 0.2601455748081207,
"learning_rate": 1.2489917869860091e-08,
"loss": 1.6017,
"num_input_tokens_seen": 8313110528,
"step": 991
},
{
"epoch": 0.496,
"grad_norm": 0.250872403383255,
"learning_rate": 9.868792685368001e-09,
"loss": 1.5581,
"num_input_tokens_seen": 8321499136,
"step": 992
},
{
"epoch": 0.4965,
"grad_norm": 0.2510140538215637,
"learning_rate": 7.55594005306337e-09,
"loss": 1.5565,
"num_input_tokens_seen": 8329887744,
"step": 993
},
{
"epoch": 0.497,
"grad_norm": 0.2628908157348633,
"learning_rate": 5.551395639988855e-09,
"loss": 1.7027,
"num_input_tokens_seen": 8338276352,
"step": 994
},
{
"epoch": 0.4975,
"grad_norm": 0.24786627292633057,
"learning_rate": 3.855190358703631e-09,
"loss": 1.5539,
"num_input_tokens_seen": 8346664960,
"step": 995
},
{
"epoch": 0.498,
"grad_norm": 0.24326151609420776,
"learning_rate": 2.467350366788246e-09,
"loss": 1.5463,
"num_input_tokens_seen": 8355053568,
"step": 996
},
{
"epoch": 0.4985,
"grad_norm": 0.2904972732067108,
"learning_rate": 1.3878970664538138e-09,
"loss": 1.4752,
"num_input_tokens_seen": 8363442176,
"step": 997
},
{
"epoch": 0.499,
"grad_norm": 0.24948014318943024,
"learning_rate": 6.168471042067303e-10,
"loss": 1.4927,
"num_input_tokens_seen": 8371830784,
"step": 998
},
{
"epoch": 0.4995,
"grad_norm": 0.24558107554912567,
"learning_rate": 1.5421237058887984e-10,
"loss": 1.5189,
"num_input_tokens_seen": 8380219392,
"step": 999
},
{
"epoch": 0.5,
"grad_norm": 0.2598268687725067,
"learning_rate": 0.0,
"loss": 1.5883,
"num_input_tokens_seen": 8388608000,
"step": 1000
}
],
"logging_steps": 1.0,
"max_steps": 1000,
"num_input_tokens_seen": 8388608000,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.902112919650304e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}