{
"best_global_step": 1260,
"best_metric": 0.10533556342124939,
"best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_svamp_1757340245/checkpoint-1260",
"epoch": 20.0,
"eval_steps": 315,
"global_step": 6300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015873015873015872,
"grad_norm": 89.08386993408203,
"learning_rate": 3.174603174603175e-07,
"loss": 5.7094,
"num_input_tokens_seen": 1056,
"step": 5
},
{
"epoch": 0.031746031746031744,
"grad_norm": 83.91914367675781,
"learning_rate": 7.142857142857143e-07,
"loss": 5.3977,
"num_input_tokens_seen": 2112,
"step": 10
},
{
"epoch": 0.047619047619047616,
"grad_norm": 61.71086883544922,
"learning_rate": 1.1111111111111112e-06,
"loss": 5.1387,
"num_input_tokens_seen": 3152,
"step": 15
},
{
"epoch": 0.06349206349206349,
"grad_norm": 53.70893096923828,
"learning_rate": 1.507936507936508e-06,
"loss": 4.76,
"num_input_tokens_seen": 4272,
"step": 20
},
{
"epoch": 0.07936507936507936,
"grad_norm": 62.33582305908203,
"learning_rate": 1.9047619047619051e-06,
"loss": 4.1702,
"num_input_tokens_seen": 5296,
"step": 25
},
{
"epoch": 0.09523809523809523,
"grad_norm": 39.57314682006836,
"learning_rate": 2.301587301587302e-06,
"loss": 3.7174,
"num_input_tokens_seen": 6352,
"step": 30
},
{
"epoch": 0.1111111111111111,
"grad_norm": 44.72786331176758,
"learning_rate": 2.6984126984126986e-06,
"loss": 3.4702,
"num_input_tokens_seen": 7472,
"step": 35
},
{
"epoch": 0.12698412698412698,
"grad_norm": 44.22084426879883,
"learning_rate": 3.0952380952380953e-06,
"loss": 3.1998,
"num_input_tokens_seen": 8544,
"step": 40
},
{
"epoch": 0.14285714285714285,
"grad_norm": 40.643310546875,
"learning_rate": 3.4920634920634924e-06,
"loss": 2.862,
"num_input_tokens_seen": 9648,
"step": 45
},
{
"epoch": 0.15873015873015872,
"grad_norm": 47.437110900878906,
"learning_rate": 3.888888888888889e-06,
"loss": 2.2716,
"num_input_tokens_seen": 10720,
"step": 50
},
{
"epoch": 0.1746031746031746,
"grad_norm": 50.03641128540039,
"learning_rate": 4.285714285714286e-06,
"loss": 2.1866,
"num_input_tokens_seen": 11792,
"step": 55
},
{
"epoch": 0.19047619047619047,
"grad_norm": 36.45273971557617,
"learning_rate": 4.682539682539683e-06,
"loss": 1.8074,
"num_input_tokens_seen": 12864,
"step": 60
},
{
"epoch": 0.20634920634920634,
"grad_norm": 47.23276901245117,
"learning_rate": 5.07936507936508e-06,
"loss": 1.5586,
"num_input_tokens_seen": 13920,
"step": 65
},
{
"epoch": 0.2222222222222222,
"grad_norm": 25.937341690063477,
"learning_rate": 5.4761904761904765e-06,
"loss": 1.4778,
"num_input_tokens_seen": 15072,
"step": 70
},
{
"epoch": 0.23809523809523808,
"grad_norm": 39.08681106567383,
"learning_rate": 5.873015873015873e-06,
"loss": 1.2204,
"num_input_tokens_seen": 16064,
"step": 75
},
{
"epoch": 0.25396825396825395,
"grad_norm": 44.03461837768555,
"learning_rate": 6.26984126984127e-06,
"loss": 1.0868,
"num_input_tokens_seen": 17216,
"step": 80
},
{
"epoch": 0.2698412698412698,
"grad_norm": 34.26078796386719,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0351,
"num_input_tokens_seen": 18272,
"step": 85
},
{
"epoch": 0.2857142857142857,
"grad_norm": 49.58864974975586,
"learning_rate": 7.063492063492063e-06,
"loss": 0.8208,
"num_input_tokens_seen": 19312,
"step": 90
},
{
"epoch": 0.30158730158730157,
"grad_norm": 44.86882400512695,
"learning_rate": 7.460317460317461e-06,
"loss": 0.8872,
"num_input_tokens_seen": 20384,
"step": 95
},
{
"epoch": 0.31746031746031744,
"grad_norm": 37.76105880737305,
"learning_rate": 7.857142857142858e-06,
"loss": 0.92,
"num_input_tokens_seen": 21424,
"step": 100
},
{
"epoch": 0.3333333333333333,
"grad_norm": 28.7552490234375,
"learning_rate": 8.253968253968254e-06,
"loss": 0.7919,
"num_input_tokens_seen": 22480,
"step": 105
},
{
"epoch": 0.3492063492063492,
"grad_norm": 47.71187973022461,
"learning_rate": 8.650793650793651e-06,
"loss": 0.8995,
"num_input_tokens_seen": 23568,
"step": 110
},
{
"epoch": 0.36507936507936506,
"grad_norm": 44.39586639404297,
"learning_rate": 9.047619047619047e-06,
"loss": 0.714,
"num_input_tokens_seen": 24672,
"step": 115
},
{
"epoch": 0.38095238095238093,
"grad_norm": 26.836511611938477,
"learning_rate": 9.444444444444445e-06,
"loss": 0.6963,
"num_input_tokens_seen": 25776,
"step": 120
},
{
"epoch": 0.3968253968253968,
"grad_norm": 32.43899917602539,
"learning_rate": 9.841269841269842e-06,
"loss": 0.8225,
"num_input_tokens_seen": 26912,
"step": 125
},
{
"epoch": 0.4126984126984127,
"grad_norm": 29.625259399414062,
"learning_rate": 1.0238095238095238e-05,
"loss": 0.7967,
"num_input_tokens_seen": 27936,
"step": 130
},
{
"epoch": 0.42857142857142855,
"grad_norm": 51.56465530395508,
"learning_rate": 1.0634920634920636e-05,
"loss": 0.6416,
"num_input_tokens_seen": 29024,
"step": 135
},
{
"epoch": 0.4444444444444444,
"grad_norm": 36.9901123046875,
"learning_rate": 1.1031746031746031e-05,
"loss": 0.6575,
"num_input_tokens_seen": 30128,
"step": 140
},
{
"epoch": 0.4603174603174603,
"grad_norm": 32.11468505859375,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.8359,
"num_input_tokens_seen": 31168,
"step": 145
},
{
"epoch": 0.47619047619047616,
"grad_norm": 17.278911590576172,
"learning_rate": 1.1825396825396825e-05,
"loss": 0.6762,
"num_input_tokens_seen": 32256,
"step": 150
},
{
"epoch": 0.49206349206349204,
"grad_norm": 28.327159881591797,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.5471,
"num_input_tokens_seen": 33296,
"step": 155
},
{
"epoch": 0.5079365079365079,
"grad_norm": 22.584383010864258,
"learning_rate": 1.261904761904762e-05,
"loss": 0.5509,
"num_input_tokens_seen": 34368,
"step": 160
},
{
"epoch": 0.5238095238095238,
"grad_norm": 33.20157241821289,
"learning_rate": 1.3015873015873018e-05,
"loss": 1.0047,
"num_input_tokens_seen": 35504,
"step": 165
},
{
"epoch": 0.5396825396825397,
"grad_norm": 29.51561737060547,
"learning_rate": 1.3412698412698413e-05,
"loss": 0.6524,
"num_input_tokens_seen": 36512,
"step": 170
},
{
"epoch": 0.5555555555555556,
"grad_norm": 21.06121826171875,
"learning_rate": 1.3809523809523811e-05,
"loss": 0.5967,
"num_input_tokens_seen": 37600,
"step": 175
},
{
"epoch": 0.5714285714285714,
"grad_norm": 22.211185455322266,
"learning_rate": 1.4206349206349207e-05,
"loss": 0.5267,
"num_input_tokens_seen": 38656,
"step": 180
},
{
"epoch": 0.5873015873015873,
"grad_norm": 23.984838485717773,
"learning_rate": 1.4603174603174605e-05,
"loss": 0.4941,
"num_input_tokens_seen": 39712,
"step": 185
},
{
"epoch": 0.6031746031746031,
"grad_norm": 31.058792114257812,
"learning_rate": 1.5e-05,
"loss": 0.7086,
"num_input_tokens_seen": 40784,
"step": 190
},
{
"epoch": 0.6190476190476191,
"grad_norm": 21.773157119750977,
"learning_rate": 1.5396825396825398e-05,
"loss": 0.6814,
"num_input_tokens_seen": 41840,
"step": 195
},
{
"epoch": 0.6349206349206349,
"grad_norm": 13.91670036315918,
"learning_rate": 1.5793650793650794e-05,
"loss": 0.6264,
"num_input_tokens_seen": 42880,
"step": 200
},
{
"epoch": 0.6507936507936508,
"grad_norm": 24.21043586730957,
"learning_rate": 1.6190476190476193e-05,
"loss": 0.7654,
"num_input_tokens_seen": 44000,
"step": 205
},
{
"epoch": 0.6666666666666666,
"grad_norm": 23.567092895507812,
"learning_rate": 1.658730158730159e-05,
"loss": 0.6436,
"num_input_tokens_seen": 45008,
"step": 210
},
{
"epoch": 0.6825396825396826,
"grad_norm": 30.81178092956543,
"learning_rate": 1.6984126984126985e-05,
"loss": 0.7867,
"num_input_tokens_seen": 46064,
"step": 215
},
{
"epoch": 0.6984126984126984,
"grad_norm": 21.90506935119629,
"learning_rate": 1.738095238095238e-05,
"loss": 0.7717,
"num_input_tokens_seen": 47152,
"step": 220
},
{
"epoch": 0.7142857142857143,
"grad_norm": 39.183982849121094,
"learning_rate": 1.777777777777778e-05,
"loss": 0.9016,
"num_input_tokens_seen": 48176,
"step": 225
},
{
"epoch": 0.7301587301587301,
"grad_norm": 12.529756546020508,
"learning_rate": 1.8174603174603176e-05,
"loss": 0.591,
"num_input_tokens_seen": 49232,
"step": 230
},
{
"epoch": 0.746031746031746,
"grad_norm": 16.73536491394043,
"learning_rate": 1.8571428571428572e-05,
"loss": 0.7902,
"num_input_tokens_seen": 50288,
"step": 235
},
{
"epoch": 0.7619047619047619,
"grad_norm": 12.837594985961914,
"learning_rate": 1.8968253968253968e-05,
"loss": 0.706,
"num_input_tokens_seen": 51376,
"step": 240
},
{
"epoch": 0.7777777777777778,
"grad_norm": 31.631135940551758,
"learning_rate": 1.9365079365079367e-05,
"loss": 0.7042,
"num_input_tokens_seen": 52448,
"step": 245
},
{
"epoch": 0.7936507936507936,
"grad_norm": 10.814046859741211,
"learning_rate": 1.9761904761904763e-05,
"loss": 0.6963,
"num_input_tokens_seen": 53520,
"step": 250
},
{
"epoch": 0.8095238095238095,
"grad_norm": 16.586753845214844,
"learning_rate": 2.015873015873016e-05,
"loss": 1.092,
"num_input_tokens_seen": 54608,
"step": 255
},
{
"epoch": 0.8253968253968254,
"grad_norm": 7.401493549346924,
"learning_rate": 2.0555555555555555e-05,
"loss": 0.582,
"num_input_tokens_seen": 55664,
"step": 260
},
{
"epoch": 0.8412698412698413,
"grad_norm": 8.16193962097168,
"learning_rate": 2.0952380952380954e-05,
"loss": 0.6384,
"num_input_tokens_seen": 56832,
"step": 265
},
{
"epoch": 0.8571428571428571,
"grad_norm": 14.588497161865234,
"learning_rate": 2.134920634920635e-05,
"loss": 0.5567,
"num_input_tokens_seen": 57968,
"step": 270
},
{
"epoch": 0.873015873015873,
"grad_norm": 13.264519691467285,
"learning_rate": 2.174603174603175e-05,
"loss": 0.603,
"num_input_tokens_seen": 59072,
"step": 275
},
{
"epoch": 0.8888888888888888,
"grad_norm": 19.701818466186523,
"learning_rate": 2.214285714285714e-05,
"loss": 0.7061,
"num_input_tokens_seen": 60112,
"step": 280
},
{
"epoch": 0.9047619047619048,
"grad_norm": 15.928413391113281,
"learning_rate": 2.253968253968254e-05,
"loss": 0.5645,
"num_input_tokens_seen": 61184,
"step": 285
},
{
"epoch": 0.9206349206349206,
"grad_norm": 20.14051055908203,
"learning_rate": 2.2936507936507937e-05,
"loss": 0.6211,
"num_input_tokens_seen": 62240,
"step": 290
},
{
"epoch": 0.9365079365079365,
"grad_norm": 28.218061447143555,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.6783,
"num_input_tokens_seen": 63312,
"step": 295
},
{
"epoch": 0.9523809523809523,
"grad_norm": 15.398136138916016,
"learning_rate": 2.373015873015873e-05,
"loss": 0.6398,
"num_input_tokens_seen": 64384,
"step": 300
},
{
"epoch": 0.9682539682539683,
"grad_norm": 6.9146246910095215,
"learning_rate": 2.4126984126984128e-05,
"loss": 0.6358,
"num_input_tokens_seen": 65392,
"step": 305
},
{
"epoch": 0.9841269841269841,
"grad_norm": 6.8382062911987305,
"learning_rate": 2.4523809523809523e-05,
"loss": 0.5311,
"num_input_tokens_seen": 66448,
"step": 310
},
{
"epoch": 1.0,
"grad_norm": 10.719470024108887,
"learning_rate": 2.4920634920634923e-05,
"loss": 0.6612,
"num_input_tokens_seen": 67504,
"step": 315
},
{
"epoch": 1.0,
"eval_loss": 0.6388102769851685,
"eval_runtime": 1.455,
"eval_samples_per_second": 48.109,
"eval_steps_per_second": 24.054,
"num_input_tokens_seen": 67504,
"step": 315
},
{
"epoch": 1.0158730158730158,
"grad_norm": 11.29240608215332,
"learning_rate": 2.531746031746032e-05,
"loss": 0.6203,
"num_input_tokens_seen": 68624,
"step": 320
},
{
"epoch": 1.0317460317460316,
"grad_norm": 10.736639022827148,
"learning_rate": 2.5714285714285714e-05,
"loss": 0.5772,
"num_input_tokens_seen": 69632,
"step": 325
},
{
"epoch": 1.0476190476190477,
"grad_norm": 14.154836654663086,
"learning_rate": 2.6111111111111114e-05,
"loss": 0.5529,
"num_input_tokens_seen": 70736,
"step": 330
},
{
"epoch": 1.0634920634920635,
"grad_norm": 10.680912971496582,
"learning_rate": 2.650793650793651e-05,
"loss": 0.3824,
"num_input_tokens_seen": 71760,
"step": 335
},
{
"epoch": 1.0793650793650793,
"grad_norm": 41.33735656738281,
"learning_rate": 2.6904761904761905e-05,
"loss": 0.9206,
"num_input_tokens_seen": 72880,
"step": 340
},
{
"epoch": 1.0952380952380953,
"grad_norm": 7.261141777038574,
"learning_rate": 2.73015873015873e-05,
"loss": 0.6019,
"num_input_tokens_seen": 73952,
"step": 345
},
{
"epoch": 1.1111111111111112,
"grad_norm": 6.877208709716797,
"learning_rate": 2.76984126984127e-05,
"loss": 0.692,
"num_input_tokens_seen": 75072,
"step": 350
},
{
"epoch": 1.126984126984127,
"grad_norm": 6.756972312927246,
"learning_rate": 2.8095238095238096e-05,
"loss": 0.7479,
"num_input_tokens_seen": 76144,
"step": 355
},
{
"epoch": 1.1428571428571428,
"grad_norm": 11.503968238830566,
"learning_rate": 2.8492063492063492e-05,
"loss": 0.6113,
"num_input_tokens_seen": 77248,
"step": 360
},
{
"epoch": 1.1587301587301586,
"grad_norm": 7.765186309814453,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.5137,
"num_input_tokens_seen": 78336,
"step": 365
},
{
"epoch": 1.1746031746031746,
"grad_norm": 15.50655460357666,
"learning_rate": 2.9285714285714288e-05,
"loss": 0.5522,
"num_input_tokens_seen": 79392,
"step": 370
},
{
"epoch": 1.1904761904761905,
"grad_norm": 4.964603900909424,
"learning_rate": 2.9682539682539683e-05,
"loss": 0.4755,
"num_input_tokens_seen": 80480,
"step": 375
},
{
"epoch": 1.2063492063492063,
"grad_norm": 11.66644287109375,
"learning_rate": 3.007936507936508e-05,
"loss": 0.9369,
"num_input_tokens_seen": 81568,
"step": 380
},
{
"epoch": 1.2222222222222223,
"grad_norm": 5.162360191345215,
"learning_rate": 3.0476190476190482e-05,
"loss": 0.3816,
"num_input_tokens_seen": 82608,
"step": 385
},
{
"epoch": 1.2380952380952381,
"grad_norm": 13.151936531066895,
"learning_rate": 3.0873015873015874e-05,
"loss": 0.6825,
"num_input_tokens_seen": 83680,
"step": 390
},
{
"epoch": 1.253968253968254,
"grad_norm": 10.89046573638916,
"learning_rate": 3.1269841269841274e-05,
"loss": 0.5983,
"num_input_tokens_seen": 84784,
"step": 395
},
{
"epoch": 1.2698412698412698,
"grad_norm": 11.605130195617676,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.527,
"num_input_tokens_seen": 85824,
"step": 400
},
{
"epoch": 1.2857142857142856,
"grad_norm": 7.945504665374756,
"learning_rate": 3.2063492063492065e-05,
"loss": 0.5015,
"num_input_tokens_seen": 86928,
"step": 405
},
{
"epoch": 1.3015873015873016,
"grad_norm": 11.201025009155273,
"learning_rate": 3.2460317460317465e-05,
"loss": 0.6972,
"num_input_tokens_seen": 88032,
"step": 410
},
{
"epoch": 1.3174603174603174,
"grad_norm": 4.484563827514648,
"learning_rate": 3.285714285714286e-05,
"loss": 0.424,
"num_input_tokens_seen": 89088,
"step": 415
},
{
"epoch": 1.3333333333333333,
"grad_norm": 11.90958023071289,
"learning_rate": 3.3253968253968256e-05,
"loss": 0.4407,
"num_input_tokens_seen": 90128,
"step": 420
},
{
"epoch": 1.3492063492063493,
"grad_norm": 9.70373249053955,
"learning_rate": 3.3650793650793656e-05,
"loss": 0.5097,
"num_input_tokens_seen": 91184,
"step": 425
},
{
"epoch": 1.3650793650793651,
"grad_norm": 8.781391143798828,
"learning_rate": 3.404761904761905e-05,
"loss": 0.6233,
"num_input_tokens_seen": 92256,
"step": 430
},
{
"epoch": 1.380952380952381,
"grad_norm": 5.017049312591553,
"learning_rate": 3.444444444444445e-05,
"loss": 0.4957,
"num_input_tokens_seen": 93312,
"step": 435
},
{
"epoch": 1.3968253968253967,
"grad_norm": 7.375868320465088,
"learning_rate": 3.484126984126984e-05,
"loss": 0.5737,
"num_input_tokens_seen": 94384,
"step": 440
},
{
"epoch": 1.4126984126984126,
"grad_norm": 6.288750648498535,
"learning_rate": 3.523809523809524e-05,
"loss": 0.7232,
"num_input_tokens_seen": 95504,
"step": 445
},
{
"epoch": 1.4285714285714286,
"grad_norm": 4.126834869384766,
"learning_rate": 3.563492063492064e-05,
"loss": 0.5578,
"num_input_tokens_seen": 96608,
"step": 450
},
{
"epoch": 1.4444444444444444,
"grad_norm": 3.2642154693603516,
"learning_rate": 3.603174603174603e-05,
"loss": 0.4916,
"num_input_tokens_seen": 97632,
"step": 455
},
{
"epoch": 1.4603174603174602,
"grad_norm": 9.297268867492676,
"learning_rate": 3.642857142857143e-05,
"loss": 0.5753,
"num_input_tokens_seen": 98752,
"step": 460
},
{
"epoch": 1.4761904761904763,
"grad_norm": 11.799019813537598,
"learning_rate": 3.682539682539683e-05,
"loss": 0.974,
"num_input_tokens_seen": 99904,
"step": 465
},
{
"epoch": 1.492063492063492,
"grad_norm": 4.892351150512695,
"learning_rate": 3.722222222222222e-05,
"loss": 0.5816,
"num_input_tokens_seen": 100960,
"step": 470
},
{
"epoch": 1.507936507936508,
"grad_norm": 3.8664710521698,
"learning_rate": 3.761904761904762e-05,
"loss": 0.4051,
"num_input_tokens_seen": 102016,
"step": 475
},
{
"epoch": 1.5238095238095237,
"grad_norm": 3.5097477436065674,
"learning_rate": 3.8015873015873014e-05,
"loss": 0.4769,
"num_input_tokens_seen": 103104,
"step": 480
},
{
"epoch": 1.5396825396825395,
"grad_norm": 5.085179328918457,
"learning_rate": 3.841269841269842e-05,
"loss": 0.4481,
"num_input_tokens_seen": 104144,
"step": 485
},
{
"epoch": 1.5555555555555556,
"grad_norm": 9.254948616027832,
"learning_rate": 3.880952380952381e-05,
"loss": 0.7627,
"num_input_tokens_seen": 105152,
"step": 490
},
{
"epoch": 1.5714285714285714,
"grad_norm": 5.68356466293335,
"learning_rate": 3.9206349206349205e-05,
"loss": 0.583,
"num_input_tokens_seen": 106192,
"step": 495
},
{
"epoch": 1.5873015873015874,
"grad_norm": 3.4472599029541016,
"learning_rate": 3.9603174603174604e-05,
"loss": 0.4961,
"num_input_tokens_seen": 107216,
"step": 500
},
{
"epoch": 1.6031746031746033,
"grad_norm": 4.6199212074279785,
"learning_rate": 4e-05,
"loss": 0.4046,
"num_input_tokens_seen": 108368,
"step": 505
},
{
"epoch": 1.619047619047619,
"grad_norm": 22.77120590209961,
"learning_rate": 4.03968253968254e-05,
"loss": 0.6756,
"num_input_tokens_seen": 109440,
"step": 510
},
{
"epoch": 1.6349206349206349,
"grad_norm": 5.753880500793457,
"learning_rate": 4.0793650793650795e-05,
"loss": 0.5926,
"num_input_tokens_seen": 110480,
"step": 515
},
{
"epoch": 1.6507936507936507,
"grad_norm": 4.747334003448486,
"learning_rate": 4.119047619047619e-05,
"loss": 0.4798,
"num_input_tokens_seen": 111488,
"step": 520
},
{
"epoch": 1.6666666666666665,
"grad_norm": 4.838809967041016,
"learning_rate": 4.1587301587301594e-05,
"loss": 0.5422,
"num_input_tokens_seen": 112528,
"step": 525
},
{
"epoch": 1.6825396825396826,
"grad_norm": 6.677746295928955,
"learning_rate": 4.1984126984126986e-05,
"loss": 0.5234,
"num_input_tokens_seen": 113600,
"step": 530
},
{
"epoch": 1.6984126984126984,
"grad_norm": 9.256402015686035,
"learning_rate": 4.2380952380952385e-05,
"loss": 0.5388,
"num_input_tokens_seen": 114720,
"step": 535
},
{
"epoch": 1.7142857142857144,
"grad_norm": 6.039902687072754,
"learning_rate": 4.277777777777778e-05,
"loss": 0.4843,
"num_input_tokens_seen": 115744,
"step": 540
},
{
"epoch": 1.7301587301587302,
"grad_norm": 14.398706436157227,
"learning_rate": 4.317460317460318e-05,
"loss": 0.3626,
"num_input_tokens_seen": 116880,
"step": 545
},
{
"epoch": 1.746031746031746,
"grad_norm": 5.464714050292969,
"learning_rate": 4.3571428571428576e-05,
"loss": 0.6535,
"num_input_tokens_seen": 117984,
"step": 550
},
{
"epoch": 1.7619047619047619,
"grad_norm": 4.07396125793457,
"learning_rate": 4.396825396825397e-05,
"loss": 0.37,
"num_input_tokens_seen": 119008,
"step": 555
},
{
"epoch": 1.7777777777777777,
"grad_norm": 9.14253044128418,
"learning_rate": 4.436507936507937e-05,
"loss": 0.6951,
"num_input_tokens_seen": 120080,
"step": 560
},
{
"epoch": 1.7936507936507935,
"grad_norm": 3.3419647216796875,
"learning_rate": 4.476190476190477e-05,
"loss": 0.238,
"num_input_tokens_seen": 121136,
"step": 565
},
{
"epoch": 1.8095238095238095,
"grad_norm": 6.380158424377441,
"learning_rate": 4.515873015873016e-05,
"loss": 0.2852,
"num_input_tokens_seen": 122144,
"step": 570
},
{
"epoch": 1.8253968253968254,
"grad_norm": 5.351354122161865,
"learning_rate": 4.555555555555556e-05,
"loss": 0.3958,
"num_input_tokens_seen": 123264,
"step": 575
},
{
"epoch": 1.8412698412698414,
"grad_norm": 2.423280954360962,
"learning_rate": 4.595238095238095e-05,
"loss": 0.2796,
"num_input_tokens_seen": 124304,
"step": 580
},
{
"epoch": 1.8571428571428572,
"grad_norm": 3.0386385917663574,
"learning_rate": 4.634920634920635e-05,
"loss": 0.2782,
"num_input_tokens_seen": 125408,
"step": 585
},
{
"epoch": 1.873015873015873,
"grad_norm": 4.597468376159668,
"learning_rate": 4.674603174603175e-05,
"loss": 0.3238,
"num_input_tokens_seen": 126464,
"step": 590
},
{
"epoch": 1.8888888888888888,
"grad_norm": 1.0736624002456665,
"learning_rate": 4.714285714285714e-05,
"loss": 0.2223,
"num_input_tokens_seen": 127600,
"step": 595
},
{
"epoch": 1.9047619047619047,
"grad_norm": 4.550055503845215,
"learning_rate": 4.753968253968254e-05,
"loss": 0.2229,
"num_input_tokens_seen": 128656,
"step": 600
},
{
"epoch": 1.9206349206349205,
"grad_norm": 2.1782174110412598,
"learning_rate": 4.793650793650794e-05,
"loss": 0.2182,
"num_input_tokens_seen": 129712,
"step": 605
},
{
"epoch": 1.9365079365079365,
"grad_norm": 7.886104583740234,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.4634,
"num_input_tokens_seen": 130720,
"step": 610
},
{
"epoch": 1.9523809523809523,
"grad_norm": 1.8265987634658813,
"learning_rate": 4.873015873015873e-05,
"loss": 0.3124,
"num_input_tokens_seen": 131792,
"step": 615
},
{
"epoch": 1.9682539682539684,
"grad_norm": 1.8508260250091553,
"learning_rate": 4.9126984126984125e-05,
"loss": 0.2745,
"num_input_tokens_seen": 132896,
"step": 620
},
{
"epoch": 1.9841269841269842,
"grad_norm": 2.103785753250122,
"learning_rate": 4.9523809523809525e-05,
"loss": 0.2197,
"num_input_tokens_seen": 133920,
"step": 625
},
{
"epoch": 2.0,
"grad_norm": 0.9688537120819092,
"learning_rate": 4.9920634920634924e-05,
"loss": 0.2233,
"num_input_tokens_seen": 135040,
"step": 630
},
{
"epoch": 2.0,
"eval_loss": 0.31226205825805664,
"eval_runtime": 1.4549,
"eval_samples_per_second": 48.114,
"eval_steps_per_second": 24.057,
"num_input_tokens_seen": 135040,
"step": 630
},
{
"epoch": 2.015873015873016,
"grad_norm": 1.8913108110427856,
"learning_rate": 4.9999938600696385e-05,
"loss": 0.2113,
"num_input_tokens_seen": 136096,
"step": 635
},
{
"epoch": 2.0317460317460316,
"grad_norm": 1.2923585176467896,
"learning_rate": 4.9999689166542295e-05,
"loss": 0.2567,
"num_input_tokens_seen": 137120,
"step": 640
},
{
"epoch": 2.0476190476190474,
"grad_norm": 1.235941767692566,
"learning_rate": 4.9999247861994194e-05,
"loss": 0.17,
"num_input_tokens_seen": 138176,
"step": 645
},
{
"epoch": 2.0634920634920633,
"grad_norm": 2.3454935550689697,
"learning_rate": 4.9998614690439037e-05,
"loss": 0.1193,
"num_input_tokens_seen": 139200,
"step": 650
},
{
"epoch": 2.0793650793650795,
"grad_norm": 0.9585188031196594,
"learning_rate": 4.9997789656736365e-05,
"loss": 0.2354,
"num_input_tokens_seen": 140288,
"step": 655
},
{
"epoch": 2.0952380952380953,
"grad_norm": 9.852370262145996,
"learning_rate": 4.9996772767218244e-05,
"loss": 0.3074,
"num_input_tokens_seen": 141360,
"step": 660
},
{
"epoch": 2.111111111111111,
"grad_norm": 1.8546109199523926,
"learning_rate": 4.9995564029689204e-05,
"loss": 0.0918,
"num_input_tokens_seen": 142416,
"step": 665
},
{
"epoch": 2.126984126984127,
"grad_norm": 5.140466690063477,
"learning_rate": 4.999416345342619e-05,
"loss": 0.159,
"num_input_tokens_seen": 143552,
"step": 670
},
{
"epoch": 2.142857142857143,
"grad_norm": 4.833157062530518,
"learning_rate": 4.9992571049178516e-05,
"loss": 0.1179,
"num_input_tokens_seen": 144592,
"step": 675
},
{
"epoch": 2.1587301587301586,
"grad_norm": 3.79502272605896,
"learning_rate": 4.999078682916774e-05,
"loss": 0.1536,
"num_input_tokens_seen": 145696,
"step": 680
},
{
"epoch": 2.1746031746031744,
"grad_norm": 12.648764610290527,
"learning_rate": 4.9988810807087584e-05,
"loss": 0.2089,
"num_input_tokens_seen": 146784,
"step": 685
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.6488538980484009,
"learning_rate": 4.998664299810385e-05,
"loss": 0.1116,
"num_input_tokens_seen": 147840,
"step": 690
},
{
"epoch": 2.2063492063492065,
"grad_norm": 3.367002487182617,
"learning_rate": 4.9984283418854284e-05,
"loss": 0.0847,
"num_input_tokens_seen": 148912,
"step": 695
},
{
"epoch": 2.2222222222222223,
"grad_norm": 2.9967145919799805,
"learning_rate": 4.998173208744843e-05,
"loss": 0.1363,
"num_input_tokens_seen": 149872,
"step": 700
},
{
"epoch": 2.238095238095238,
"grad_norm": 0.24820205569267273,
"learning_rate": 4.9978989023467536e-05,
"loss": 0.1622,
"num_input_tokens_seen": 150976,
"step": 705
},
{
"epoch": 2.253968253968254,
"grad_norm": 2.1588690280914307,
"learning_rate": 4.997605424796439e-05,
"loss": 0.0558,
"num_input_tokens_seen": 152032,
"step": 710
},
{
"epoch": 2.2698412698412698,
"grad_norm": 8.513988494873047,
"learning_rate": 4.997292778346312e-05,
"loss": 0.1075,
"num_input_tokens_seen": 153072,
"step": 715
},
{
"epoch": 2.2857142857142856,
"grad_norm": 5.093414306640625,
"learning_rate": 4.996960965395906e-05,
"loss": 0.2232,
"num_input_tokens_seen": 154080,
"step": 720
},
{
"epoch": 2.3015873015873014,
"grad_norm": 0.1848161220550537,
"learning_rate": 4.996609988491856e-05,
"loss": 0.0631,
"num_input_tokens_seen": 155168,
"step": 725
},
{
"epoch": 2.317460317460317,
"grad_norm": 1.1659319400787354,
"learning_rate": 4.99623985032788e-05,
"loss": 0.1902,
"num_input_tokens_seen": 156192,
"step": 730
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.7289673686027527,
"learning_rate": 4.9958505537447535e-05,
"loss": 0.0223,
"num_input_tokens_seen": 157184,
"step": 735
},
{
"epoch": 2.3492063492063493,
"grad_norm": 5.103140354156494,
"learning_rate": 4.9954421017302947e-05,
"loss": 0.1893,
"num_input_tokens_seen": 158240,
"step": 740
},
{
"epoch": 2.365079365079365,
"grad_norm": 4.727173328399658,
"learning_rate": 4.9950144974193364e-05,
"loss": 0.207,
"num_input_tokens_seen": 159312,
"step": 745
},
{
"epoch": 2.380952380952381,
"grad_norm": 1.829289197921753,
"learning_rate": 4.994567744093703e-05,
"loss": 0.0484,
"num_input_tokens_seen": 160304,
"step": 750
},
{
"epoch": 2.3968253968253967,
"grad_norm": 2.2619996070861816,
"learning_rate": 4.9941018451821866e-05,
"loss": 0.1312,
"num_input_tokens_seen": 161424,
"step": 755
},
{
"epoch": 2.4126984126984126,
"grad_norm": 3.3826417922973633,
"learning_rate": 4.993616804260521e-05,
"loss": 0.1096,
"num_input_tokens_seen": 162480,
"step": 760
},
{
"epoch": 2.4285714285714284,
"grad_norm": 1.208579659461975,
"learning_rate": 4.9931126250513516e-05,
"loss": 0.0433,
"num_input_tokens_seen": 163568,
"step": 765
},
{
"epoch": 2.4444444444444446,
"grad_norm": 2.259871006011963,
"learning_rate": 4.992589311424208e-05,
"loss": 0.0786,
"num_input_tokens_seen": 164608,
"step": 770
},
{
"epoch": 2.4603174603174605,
"grad_norm": 1.2732272148132324,
"learning_rate": 4.992046867395478e-05,
"loss": 0.1996,
"num_input_tokens_seen": 165712,
"step": 775
},
{
"epoch": 2.4761904761904763,
"grad_norm": 5.162652492523193,
"learning_rate": 4.991485297128369e-05,
"loss": 0.1116,
"num_input_tokens_seen": 166736,
"step": 780
},
{
"epoch": 2.492063492063492,
"grad_norm": 3.201205253601074,
"learning_rate": 4.9909046049328846e-05,
"loss": 0.1202,
"num_input_tokens_seen": 167872,
"step": 785
},
{
"epoch": 2.507936507936508,
"grad_norm": 2.6469757556915283,
"learning_rate": 4.9903047952657856e-05,
"loss": 0.0957,
"num_input_tokens_seen": 168960,
"step": 790
},
{
"epoch": 2.5238095238095237,
"grad_norm": 1.523950219154358,
"learning_rate": 4.989685872730557e-05,
"loss": 0.0956,
"num_input_tokens_seen": 169968,
"step": 795
},
{
"epoch": 2.5396825396825395,
"grad_norm": 0.4077323377132416,
"learning_rate": 4.9890478420773746e-05,
"loss": 0.0856,
"num_input_tokens_seen": 171072,
"step": 800
},
{
"epoch": 2.5555555555555554,
"grad_norm": 1.0244964361190796,
"learning_rate": 4.988390708203068e-05,
"loss": 0.0398,
"num_input_tokens_seen": 172144,
"step": 805
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.2922608554363251,
"learning_rate": 4.9877144761510806e-05,
"loss": 0.1052,
"num_input_tokens_seen": 173264,
"step": 810
},
{
"epoch": 2.5873015873015874,
"grad_norm": 4.774709701538086,
"learning_rate": 4.987019151111433e-05,
"loss": 0.1851,
"num_input_tokens_seen": 174352,
"step": 815
},
{
"epoch": 2.6031746031746033,
"grad_norm": 4.259119510650635,
"learning_rate": 4.9863047384206835e-05,
"loss": 0.0609,
"num_input_tokens_seen": 175456,
"step": 820
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.8433787226676941,
"learning_rate": 4.9855712435618864e-05,
"loss": 0.0427,
"num_input_tokens_seen": 176576,
"step": 825
},
{
"epoch": 2.634920634920635,
"grad_norm": 1.5698630809783936,
"learning_rate": 4.9848186721645484e-05,
"loss": 0.1784,
"num_input_tokens_seen": 177632,
"step": 830
},
{
"epoch": 2.6507936507936507,
"grad_norm": 5.571624279022217,
"learning_rate": 4.98404703000459e-05,
"loss": 0.1248,
"num_input_tokens_seen": 178784,
"step": 835
},
{
"epoch": 2.6666666666666665,
"grad_norm": 3.441417932510376,
"learning_rate": 4.983256323004295e-05,
"loss": 0.0175,
"num_input_tokens_seen": 179888,
"step": 840
},
{
"epoch": 2.682539682539683,
"grad_norm": 0.750073254108429,
"learning_rate": 4.982446557232269e-05,
"loss": 0.1023,
"num_input_tokens_seen": 180976,
"step": 845
},
{
"epoch": 2.6984126984126986,
"grad_norm": 15.302396774291992,
"learning_rate": 4.981617738903393e-05,
"loss": 0.1268,
"num_input_tokens_seen": 182064,
"step": 850
},
{
"epoch": 2.7142857142857144,
"grad_norm": 3.7665250301361084,
"learning_rate": 4.9807698743787744e-05,
"loss": 0.1289,
"num_input_tokens_seen": 183168,
"step": 855
},
{
"epoch": 2.7301587301587302,
"grad_norm": 3.019672393798828,
"learning_rate": 4.9799029701656975e-05,
"loss": 0.0422,
"num_input_tokens_seen": 184192,
"step": 860
},
{
"epoch": 2.746031746031746,
"grad_norm": 0.010081956163048744,
"learning_rate": 4.9790170329175754e-05,
"loss": 0.0462,
"num_input_tokens_seen": 185248,
"step": 865
},
{
"epoch": 2.761904761904762,
"grad_norm": 3.496649980545044,
"learning_rate": 4.978112069433899e-05,
"loss": 0.106,
"num_input_tokens_seen": 186352,
"step": 870
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.3301718831062317,
"learning_rate": 4.97718808666018e-05,
"loss": 0.0115,
"num_input_tokens_seen": 187376,
"step": 875
},
{
"epoch": 2.7936507936507935,
"grad_norm": 0.4660927951335907,
"learning_rate": 4.976245091687906e-05,
"loss": 0.0609,
"num_input_tokens_seen": 188448,
"step": 880
},
{
"epoch": 2.8095238095238093,
"grad_norm": 2.0272302627563477,
"learning_rate": 4.975283091754479e-05,
"loss": 0.2057,
"num_input_tokens_seen": 189520,
"step": 885
},
{
"epoch": 2.825396825396825,
"grad_norm": 0.16836805641651154,
"learning_rate": 4.974302094243164e-05,
"loss": 0.0908,
"num_input_tokens_seen": 190608,
"step": 890
},
{
"epoch": 2.8412698412698414,
"grad_norm": 0.5413910150527954,
"learning_rate": 4.973302106683029e-05,
"loss": 0.1349,
"num_input_tokens_seen": 191760,
"step": 895
},
{
"epoch": 2.857142857142857,
"grad_norm": 3.522494077682495,
"learning_rate": 4.972283136748889e-05,
"loss": 0.0679,
"num_input_tokens_seen": 192784,
"step": 900
},
{
"epoch": 2.873015873015873,
"grad_norm": 1.0514668226242065,
"learning_rate": 4.971245192261249e-05,
"loss": 0.1376,
"num_input_tokens_seen": 193840,
"step": 905
},
{
"epoch": 2.888888888888889,
"grad_norm": 5.272775650024414,
"learning_rate": 4.970188281186241e-05,
"loss": 0.1013,
"num_input_tokens_seen": 194928,
"step": 910
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.7582864165306091,
"learning_rate": 4.9691124116355617e-05,
"loss": 0.0419,
"num_input_tokens_seen": 196016,
"step": 915
},
{
"epoch": 2.9206349206349205,
"grad_norm": 2.6769752502441406,
"learning_rate": 4.968017591866416e-05,
"loss": 0.2212,
"num_input_tokens_seen": 197152,
"step": 920
},
{
"epoch": 2.9365079365079367,
"grad_norm": 2.0376977920532227,
"learning_rate": 4.966903830281449e-05,
"loss": 0.0717,
"num_input_tokens_seen": 198208,
"step": 925
},
{
"epoch": 2.9523809523809526,
"grad_norm": 1.3439394235610962,
"learning_rate": 4.96577113542868e-05,
"loss": 0.0629,
"num_input_tokens_seen": 199296,
"step": 930
},
{
"epoch": 2.9682539682539684,
"grad_norm": 1.4719243049621582,
"learning_rate": 4.964619516001442e-05,
"loss": 0.0406,
"num_input_tokens_seen": 200352,
"step": 935
},
{
"epoch": 2.984126984126984,
"grad_norm": 0.41749292612075806,
"learning_rate": 4.963448980838312e-05,
"loss": 0.0661,
"num_input_tokens_seen": 201424,
"step": 940
},
{
"epoch": 3.0,
"grad_norm": 0.5001729726791382,
"learning_rate": 4.9622595389230445e-05,
"loss": 0.1283,
"num_input_tokens_seen": 202528,
"step": 945
},
{
"epoch": 3.0,
"eval_loss": 0.10918113589286804,
"eval_runtime": 1.4612,
"eval_samples_per_second": 47.905,
"eval_steps_per_second": 23.952,
"num_input_tokens_seen": 202528,
"step": 945
},
{
"epoch": 3.015873015873016,
"grad_norm": 3.957029104232788,
"learning_rate": 4.9610511993844986e-05,
"loss": 0.0249,
"num_input_tokens_seen": 203552,
"step": 950
},
{
"epoch": 3.0317460317460316,
"grad_norm": 2.8302228450775146,
"learning_rate": 4.959823971496574e-05,
"loss": 0.0556,
"num_input_tokens_seen": 204656,
"step": 955
},
{
"epoch": 3.0476190476190474,
"grad_norm": 0.05084440857172012,
"learning_rate": 4.9585778646781364e-05,
"loss": 0.0979,
"num_input_tokens_seen": 205744,
"step": 960
},
{
"epoch": 3.0634920634920633,
"grad_norm": 0.25631648302078247,
"learning_rate": 4.957312888492944e-05,
"loss": 0.0046,
"num_input_tokens_seen": 206800,
"step": 965
},
{
"epoch": 3.0793650793650795,
"grad_norm": 3.6322312355041504,
"learning_rate": 4.9560290526495764e-05,
"loss": 0.0374,
"num_input_tokens_seen": 207808,
"step": 970
},
{
"epoch": 3.0952380952380953,
"grad_norm": 5.5144171714782715,
"learning_rate": 4.954726367001361e-05,
"loss": 0.1074,
"num_input_tokens_seen": 208896,
"step": 975
},
{
"epoch": 3.111111111111111,
"grad_norm": 6.550605773925781,
"learning_rate": 4.9534048415462934e-05,
"loss": 0.0411,
"num_input_tokens_seen": 209920,
"step": 980
},
{
"epoch": 3.126984126984127,
"grad_norm": 0.2177072912454605,
"learning_rate": 4.952064486426965e-05,
"loss": 0.0858,
"num_input_tokens_seen": 210992,
"step": 985
},
{
"epoch": 3.142857142857143,
"grad_norm": 1.0022085905075073,
"learning_rate": 4.9507053119304805e-05,
"loss": 0.0535,
"num_input_tokens_seen": 212064,
"step": 990
},
{
"epoch": 3.1587301587301586,
"grad_norm": 1.5411404371261597,
"learning_rate": 4.9493273284883854e-05,
"loss": 0.0322,
"num_input_tokens_seen": 213152,
"step": 995
},
{
"epoch": 3.1746031746031744,
"grad_norm": 0.013450037688016891,
"learning_rate": 4.947930546676579e-05,
"loss": 0.0833,
"num_input_tokens_seen": 214192,
"step": 1000
},
{
"epoch": 3.1904761904761907,
"grad_norm": 2.3490381240844727,
"learning_rate": 4.946514977215238e-05,
"loss": 0.0799,
"num_input_tokens_seen": 215264,
"step": 1005
},
{
"epoch": 3.2063492063492065,
"grad_norm": 1.3413841724395752,
"learning_rate": 4.945080630968733e-05,
"loss": 0.0728,
"num_input_tokens_seen": 216320,
"step": 1010
},
{
"epoch": 3.2222222222222223,
"grad_norm": 2.792207956314087,
"learning_rate": 4.943627518945543e-05,
"loss": 0.0547,
"num_input_tokens_seen": 217376,
"step": 1015
},
{
"epoch": 3.238095238095238,
"grad_norm": 0.3178693652153015,
"learning_rate": 4.942155652298174e-05,
"loss": 0.0479,
"num_input_tokens_seen": 218512,
"step": 1020
},
{
"epoch": 3.253968253968254,
"grad_norm": 2.8839704990386963,
"learning_rate": 4.940665042323072e-05,
"loss": 0.0446,
"num_input_tokens_seen": 219552,
"step": 1025
},
{
"epoch": 3.2698412698412698,
"grad_norm": 3.0642411708831787,
"learning_rate": 4.939155700460536e-05,
"loss": 0.019,
"num_input_tokens_seen": 220720,
"step": 1030
},
{
"epoch": 3.2857142857142856,
"grad_norm": 2.056198835372925,
"learning_rate": 4.9376276382946304e-05,
"loss": 0.1516,
"num_input_tokens_seen": 221792,
"step": 1035
},
{
"epoch": 3.3015873015873014,
"grad_norm": 0.08110915124416351,
"learning_rate": 4.936080867553099e-05,
"loss": 0.0481,
"num_input_tokens_seen": 222880,
"step": 1040
},
{
"epoch": 3.317460317460317,
"grad_norm": 2.887831449508667,
"learning_rate": 4.934515400107266e-05,
"loss": 0.0852,
"num_input_tokens_seen": 223984,
"step": 1045
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.9948076009750366,
"learning_rate": 4.932931247971958e-05,
"loss": 0.0345,
"num_input_tokens_seen": 225056,
"step": 1050
},
{
"epoch": 3.3492063492063493,
"grad_norm": 4.810222625732422,
"learning_rate": 4.9313284233054004e-05,
"loss": 0.1105,
"num_input_tokens_seen": 226160,
"step": 1055
},
{
"epoch": 3.365079365079365,
"grad_norm": 0.02646580897271633,
"learning_rate": 4.9297069384091306e-05,
"loss": 0.0325,
"num_input_tokens_seen": 227232,
"step": 1060
},
{
"epoch": 3.380952380952381,
"grad_norm": 0.13596875965595245,
"learning_rate": 4.9280668057279014e-05,
"loss": 0.0125,
"num_input_tokens_seen": 228304,
"step": 1065
},
{
"epoch": 3.3968253968253967,
"grad_norm": 0.041388608515262604,
"learning_rate": 4.9264080378495846e-05,
"loss": 0.0326,
"num_input_tokens_seen": 229344,
"step": 1070
},
{
"epoch": 3.4126984126984126,
"grad_norm": 1.731412410736084,
"learning_rate": 4.924730647505078e-05,
"loss": 0.0411,
"num_input_tokens_seen": 230368,
"step": 1075
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.9932308793067932,
"learning_rate": 4.923034647568202e-05,
"loss": 0.0689,
"num_input_tokens_seen": 231408,
"step": 1080
},
{
"epoch": 3.4444444444444446,
"grad_norm": 4.429158687591553,
"learning_rate": 4.921320051055606e-05,
"loss": 0.0849,
"num_input_tokens_seen": 232464,
"step": 1085
},
{
"epoch": 3.4603174603174605,
"grad_norm": 0.05143646523356438,
"learning_rate": 4.919586871126667e-05,
"loss": 0.0295,
"num_input_tokens_seen": 233552,
"step": 1090
},
{
"epoch": 3.4761904761904763,
"grad_norm": 0.026359835639595985,
"learning_rate": 4.917835121083384e-05,
"loss": 0.0546,
"num_input_tokens_seen": 234624,
"step": 1095
},
{
"epoch": 3.492063492063492,
"grad_norm": 1.5680820941925049,
"learning_rate": 4.916064814370287e-05,
"loss": 0.0547,
"num_input_tokens_seen": 235696,
"step": 1100
},
{
"epoch": 3.507936507936508,
"grad_norm": 0.5346999168395996,
"learning_rate": 4.91427596457432e-05,
"loss": 0.0992,
"num_input_tokens_seen": 236704,
"step": 1105
},
{
"epoch": 3.5238095238095237,
"grad_norm": 3.054236888885498,
"learning_rate": 4.9124685854247465e-05,
"loss": 0.0986,
"num_input_tokens_seen": 237776,
"step": 1110
},
{
"epoch": 3.5396825396825395,
"grad_norm": 0.5600343942642212,
"learning_rate": 4.910642690793043e-05,
"loss": 0.048,
"num_input_tokens_seen": 238864,
"step": 1115
},
{
"epoch": 3.5555555555555554,
"grad_norm": 3.9517204761505127,
"learning_rate": 4.908798294692786e-05,
"loss": 0.1061,
"num_input_tokens_seen": 239856,
"step": 1120
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.37156346440315247,
"learning_rate": 4.906935411279553e-05,
"loss": 0.011,
"num_input_tokens_seen": 240896,
"step": 1125
},
{
"epoch": 3.5873015873015874,
"grad_norm": 0.4936334192752838,
"learning_rate": 4.9050540548508094e-05,
"loss": 0.0569,
"num_input_tokens_seen": 242000,
"step": 1130
},
{
"epoch": 3.6031746031746033,
"grad_norm": 10.88504695892334,
"learning_rate": 4.9031542398457974e-05,
"loss": 0.0373,
"num_input_tokens_seen": 243056,
"step": 1135
},
{
"epoch": 3.619047619047619,
"grad_norm": 1.345165491104126,
"learning_rate": 4.901235980845429e-05,
"loss": 0.0429,
"num_input_tokens_seen": 244112,
"step": 1140
},
{
"epoch": 3.634920634920635,
"grad_norm": 0.43414610624313354,
"learning_rate": 4.899299292572172e-05,
"loss": 0.0253,
"num_input_tokens_seen": 245216,
"step": 1145
},
{
"epoch": 3.6507936507936507,
"grad_norm": 1.8910837173461914,
"learning_rate": 4.897344189889936e-05,
"loss": 0.0114,
"num_input_tokens_seen": 246272,
"step": 1150
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.01159939169883728,
"learning_rate": 4.895370687803962e-05,
"loss": 0.0804,
"num_input_tokens_seen": 247392,
"step": 1155
},
{
"epoch": 3.682539682539683,
"grad_norm": 6.754380702972412,
"learning_rate": 4.893378801460702e-05,
"loss": 0.067,
"num_input_tokens_seen": 248480,
"step": 1160
},
{
"epoch": 3.6984126984126986,
"grad_norm": 1.1904765367507935,
"learning_rate": 4.8913685461477066e-05,
"loss": 0.0089,
"num_input_tokens_seen": 249536,
"step": 1165
},
{
"epoch": 3.7142857142857144,
"grad_norm": 1.1461362838745117,
"learning_rate": 4.889339937293508e-05,
"loss": 0.0426,
"num_input_tokens_seen": 250608,
"step": 1170
},
{
"epoch": 3.7301587301587302,
"grad_norm": 1.271159052848816,
"learning_rate": 4.8872929904674966e-05,
"loss": 0.0752,
"num_input_tokens_seen": 251632,
"step": 1175
},
{
"epoch": 3.746031746031746,
"grad_norm": 3.5823814868927,
"learning_rate": 4.8852277213798106e-05,
"loss": 0.076,
"num_input_tokens_seen": 252752,
"step": 1180
},
{
"epoch": 3.761904761904762,
"grad_norm": 0.5149009823799133,
"learning_rate": 4.883144145881205e-05,
"loss": 0.0884,
"num_input_tokens_seen": 253808,
"step": 1185
},
{
"epoch": 3.7777777777777777,
"grad_norm": 1.5054774284362793,
"learning_rate": 4.8810422799629375e-05,
"loss": 0.1014,
"num_input_tokens_seen": 254864,
"step": 1190
},
{
"epoch": 3.7936507936507935,
"grad_norm": 2.613424301147461,
"learning_rate": 4.878922139756641e-05,
"loss": 0.0201,
"num_input_tokens_seen": 255904,
"step": 1195
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.6467133164405823,
"learning_rate": 4.876783741534204e-05,
"loss": 0.0327,
"num_input_tokens_seen": 256976,
"step": 1200
},
{
"epoch": 3.825396825396825,
"grad_norm": 0.5593597292900085,
"learning_rate": 4.874627101707644e-05,
"loss": 0.0307,
"num_input_tokens_seen": 258016,
"step": 1205
},
{
"epoch": 3.8412698412698414,
"grad_norm": 2.611604928970337,
"learning_rate": 4.872452236828979e-05,
"loss": 0.1157,
"num_input_tokens_seen": 259072,
"step": 1210
},
{
"epoch": 3.857142857142857,
"grad_norm": 8.799260139465332,
"learning_rate": 4.870259163590103e-05,
"loss": 0.1447,
"num_input_tokens_seen": 260224,
"step": 1215
},
{
"epoch": 3.873015873015873,
"grad_norm": 1.5195016860961914,
"learning_rate": 4.8680478988226606e-05,
"loss": 0.0436,
"num_input_tokens_seen": 261312,
"step": 1220
},
{
"epoch": 3.888888888888889,
"grad_norm": 7.93579626083374,
"learning_rate": 4.865818459497911e-05,
"loss": 0.113,
"num_input_tokens_seen": 262352,
"step": 1225
},
{
"epoch": 3.9047619047619047,
"grad_norm": 1.4610778093338013,
"learning_rate": 4.863570862726603e-05,
"loss": 0.3412,
"num_input_tokens_seen": 263440,
"step": 1230
},
{
"epoch": 3.9206349206349205,
"grad_norm": 0.9858604073524475,
"learning_rate": 4.861305125758842e-05,
"loss": 0.0656,
"num_input_tokens_seen": 264480,
"step": 1235
},
{
"epoch": 3.9365079365079367,
"grad_norm": 5.148216724395752,
"learning_rate": 4.859021265983959e-05,
"loss": 0.0796,
"num_input_tokens_seen": 265616,
"step": 1240
},
{
"epoch": 3.9523809523809526,
"grad_norm": 0.5790823698043823,
"learning_rate": 4.856719300930375e-05,
"loss": 0.0355,
"num_input_tokens_seen": 266720,
"step": 1245
},
{
"epoch": 3.9682539682539684,
"grad_norm": 0.20517916977405548,
"learning_rate": 4.854399248265465e-05,
"loss": 0.0258,
"num_input_tokens_seen": 267824,
"step": 1250
},
{
"epoch": 3.984126984126984,
"grad_norm": 0.5621991753578186,
"learning_rate": 4.852061125795431e-05,
"loss": 0.0208,
"num_input_tokens_seen": 268832,
"step": 1255
},
{
"epoch": 4.0,
"grad_norm": 2.5780136585235596,
"learning_rate": 4.8497049514651514e-05,
"loss": 0.1397,
"num_input_tokens_seen": 269840,
"step": 1260
},
{
"epoch": 4.0,
"eval_loss": 0.10533556342124939,
"eval_runtime": 1.4561,
"eval_samples_per_second": 48.074,
"eval_steps_per_second": 24.037,
"num_input_tokens_seen": 269840,
"step": 1260
},
{
"epoch": 4.015873015873016,
"grad_norm": 0.0031560775823891163,
"learning_rate": 4.8473307433580575e-05,
"loss": 0.0282,
"num_input_tokens_seen": 270944,
"step": 1265
},
{
"epoch": 4.031746031746032,
"grad_norm": 0.22770990431308746,
"learning_rate": 4.844938519695984e-05,
"loss": 0.0168,
"num_input_tokens_seen": 272032,
"step": 1270
},
{
"epoch": 4.0476190476190474,
"grad_norm": 1.012162208557129,
"learning_rate": 4.8425282988390376e-05,
"loss": 0.0079,
"num_input_tokens_seen": 273104,
"step": 1275
},
{
"epoch": 4.063492063492063,
"grad_norm": 1.5583217144012451,
"learning_rate": 4.840100099285446e-05,
"loss": 0.0461,
"num_input_tokens_seen": 274144,
"step": 1280
},
{
"epoch": 4.079365079365079,
"grad_norm": 1.6686698198318481,
"learning_rate": 4.837653939671427e-05,
"loss": 0.0068,
"num_input_tokens_seen": 275216,
"step": 1285
},
{
"epoch": 4.095238095238095,
"grad_norm": 2.978330612182617,
"learning_rate": 4.8351898387710394e-05,
"loss": 0.0831,
"num_input_tokens_seen": 276352,
"step": 1290
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.0014791067223995924,
"learning_rate": 4.832707815496036e-05,
"loss": 0.001,
"num_input_tokens_seen": 277360,
"step": 1295
},
{
"epoch": 4.1269841269841265,
"grad_norm": 0.04002991318702698,
"learning_rate": 4.830207888895727e-05,
"loss": 0.0338,
"num_input_tokens_seen": 278448,
"step": 1300
},
{
"epoch": 4.142857142857143,
"grad_norm": 12.669215202331543,
"learning_rate": 4.827690078156826e-05,
"loss": 0.0323,
"num_input_tokens_seen": 279552,
"step": 1305
},
{
"epoch": 4.158730158730159,
"grad_norm": 2.2725536823272705,
"learning_rate": 4.825154402603308e-05,
"loss": 0.0369,
"num_input_tokens_seen": 280624,
"step": 1310
},
{
"epoch": 4.174603174603175,
"grad_norm": 0.0256810262799263,
"learning_rate": 4.822600881696256e-05,
"loss": 0.0193,
"num_input_tokens_seen": 281728,
"step": 1315
},
{
"epoch": 4.190476190476191,
"grad_norm": 2.8046810626983643,
"learning_rate": 4.820029535033719e-05,
"loss": 0.0931,
"num_input_tokens_seen": 282864,
"step": 1320
},
{
"epoch": 4.2063492063492065,
"grad_norm": 0.17485536634922028,
"learning_rate": 4.817440382350551e-05,
"loss": 0.0233,
"num_input_tokens_seen": 283952,
"step": 1325
},
{
"epoch": 4.222222222222222,
"grad_norm": 1.145687222480774,
"learning_rate": 4.814833443518271e-05,
"loss": 0.0168,
"num_input_tokens_seen": 284960,
"step": 1330
},
{
"epoch": 4.238095238095238,
"grad_norm": 0.14578741788864136,
"learning_rate": 4.812208738544901e-05,
"loss": 0.0517,
"num_input_tokens_seen": 286000,
"step": 1335
},
{
"epoch": 4.253968253968254,
"grad_norm": 0.21778497099876404,
"learning_rate": 4.809566287574821e-05,
"loss": 0.0311,
"num_input_tokens_seen": 287088,
"step": 1340
},
{
"epoch": 4.26984126984127,
"grad_norm": 2.5348594188690186,
"learning_rate": 4.806906110888606e-05,
"loss": 0.0074,
"num_input_tokens_seen": 288208,
"step": 1345
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.19400948286056519,
"learning_rate": 4.804228228902876e-05,
"loss": 0.0265,
"num_input_tokens_seen": 289232,
"step": 1350
},
{
"epoch": 4.301587301587301,
"grad_norm": 0.38872888684272766,
"learning_rate": 4.8015326621701386e-05,
"loss": 0.0044,
"num_input_tokens_seen": 290288,
"step": 1355
},
{
"epoch": 4.317460317460317,
"grad_norm": 0.0075505380518734455,
"learning_rate": 4.7988194313786275e-05,
"loss": 0.0011,
"num_input_tokens_seen": 291328,
"step": 1360
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.0007070943829603493,
"learning_rate": 4.796088557352148e-05,
"loss": 0.0647,
"num_input_tokens_seen": 292400,
"step": 1365
},
{
"epoch": 4.349206349206349,
"grad_norm": 2.9666683673858643,
"learning_rate": 4.7933400610499164e-05,
"loss": 0.104,
"num_input_tokens_seen": 293520,
"step": 1370
},
{
"epoch": 4.365079365079365,
"grad_norm": 0.0026558933313935995,
"learning_rate": 4.7905739635663984e-05,
"loss": 0.0021,
"num_input_tokens_seen": 294608,
"step": 1375
},
{
"epoch": 4.380952380952381,
"grad_norm": 9.051935195922852,
"learning_rate": 4.7877902861311446e-05,
"loss": 0.1086,
"num_input_tokens_seen": 295648,
"step": 1380
},
{
"epoch": 4.396825396825397,
"grad_norm": 0.06698460876941681,
"learning_rate": 4.784989050108634e-05,
"loss": 0.0073,
"num_input_tokens_seen": 296704,
"step": 1385
},
{
"epoch": 4.412698412698413,
"grad_norm": 0.30536431074142456,
"learning_rate": 4.782170276998104e-05,
"loss": 0.0017,
"num_input_tokens_seen": 297760,
"step": 1390
},
{
"epoch": 4.428571428571429,
"grad_norm": 0.0008031931356526911,
"learning_rate": 4.779333988433386e-05,
"loss": 0.0607,
"num_input_tokens_seen": 298848,
"step": 1395
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.02665814757347107,
"learning_rate": 4.7764802061827455e-05,
"loss": 0.0705,
"num_input_tokens_seen": 299936,
"step": 1400
},
{
"epoch": 4.4603174603174605,
"grad_norm": 1.0428848266601562,
"learning_rate": 4.773608952148706e-05,
"loss": 0.0549,
"num_input_tokens_seen": 301056,
"step": 1405
},
{
"epoch": 4.476190476190476,
"grad_norm": 0.06535102427005768,
"learning_rate": 4.770720248367887e-05,
"loss": 0.0352,
"num_input_tokens_seen": 302080,
"step": 1410
},
{
"epoch": 4.492063492063492,
"grad_norm": 0.04799408093094826,
"learning_rate": 4.7678141170108345e-05,
"loss": 0.0048,
"num_input_tokens_seen": 303104,
"step": 1415
},
{
"epoch": 4.507936507936508,
"grad_norm": 6.913591384887695,
"learning_rate": 4.764890580381849e-05,
"loss": 0.1083,
"num_input_tokens_seen": 304240,
"step": 1420
},
{
"epoch": 4.523809523809524,
"grad_norm": 4.7112956047058105,
"learning_rate": 4.761949660918814e-05,
"loss": 0.037,
"num_input_tokens_seen": 305360,
"step": 1425
},
{
"epoch": 4.5396825396825395,
"grad_norm": 0.5090931057929993,
"learning_rate": 4.7589913811930234e-05,
"loss": 0.0063,
"num_input_tokens_seen": 306416,
"step": 1430
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.002227133372798562,
"learning_rate": 4.756015763909014e-05,
"loss": 0.0192,
"num_input_tokens_seen": 307408,
"step": 1435
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.0006507275975309312,
"learning_rate": 4.753022831904383e-05,
"loss": 0.0223,
"num_input_tokens_seen": 308432,
"step": 1440
},
{
"epoch": 4.587301587301587,
"grad_norm": 3.9856300354003906,
"learning_rate": 4.750012608149618e-05,
"loss": 0.0237,
"num_input_tokens_seen": 309472,
"step": 1445
},
{
"epoch": 4.603174603174603,
"grad_norm": 1.1103370189666748,
"learning_rate": 4.7469851157479177e-05,
"loss": 0.1615,
"num_input_tokens_seen": 310512,
"step": 1450
},
{
"epoch": 4.619047619047619,
"grad_norm": 1.3942312002182007,
"learning_rate": 4.743940377935019e-05,
"loss": 0.0413,
"num_input_tokens_seen": 311632,
"step": 1455
},
{
"epoch": 4.634920634920634,
"grad_norm": 0.05732967332005501,
"learning_rate": 4.740878418079014e-05,
"loss": 0.0197,
"num_input_tokens_seen": 312688,
"step": 1460
},
{
"epoch": 4.650793650793651,
"grad_norm": 7.539809226989746,
"learning_rate": 4.737799259680172e-05,
"loss": 0.119,
"num_input_tokens_seen": 313760,
"step": 1465
},
{
"epoch": 4.666666666666667,
"grad_norm": 1.541993260383606,
"learning_rate": 4.73470292637076e-05,
"loss": 0.043,
"num_input_tokens_seen": 314816,
"step": 1470
},
{
"epoch": 4.682539682539683,
"grad_norm": 0.03592051565647125,
"learning_rate": 4.731589441914862e-05,
"loss": 0.013,
"num_input_tokens_seen": 315840,
"step": 1475
},
{
"epoch": 4.698412698412699,
"grad_norm": 5.116149425506592,
"learning_rate": 4.7284588302081946e-05,
"loss": 0.0987,
"num_input_tokens_seen": 316960,
"step": 1480
},
{
"epoch": 4.714285714285714,
"grad_norm": 0.07333391159772873,
"learning_rate": 4.725311115277924e-05,
"loss": 0.0773,
"num_input_tokens_seen": 318016,
"step": 1485
},
{
"epoch": 4.73015873015873,
"grad_norm": 2.1563048362731934,
"learning_rate": 4.7221463212824835e-05,
"loss": 0.0783,
"num_input_tokens_seen": 319136,
"step": 1490
},
{
"epoch": 4.746031746031746,
"grad_norm": 0.4926490783691406,
"learning_rate": 4.718964472511386e-05,
"loss": 0.0329,
"num_input_tokens_seen": 320288,
"step": 1495
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.00580169539898634,
"learning_rate": 4.715765593385036e-05,
"loss": 0.0217,
"num_input_tokens_seen": 321312,
"step": 1500
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.00831407681107521,
"learning_rate": 4.71254970845455e-05,
"loss": 0.0561,
"num_input_tokens_seen": 322368,
"step": 1505
},
{
"epoch": 4.7936507936507935,
"grad_norm": 0.007532383315265179,
"learning_rate": 4.709316842401557e-05,
"loss": 0.036,
"num_input_tokens_seen": 323392,
"step": 1510
},
{
"epoch": 4.809523809523809,
"grad_norm": 0.6496375799179077,
"learning_rate": 4.706067020038017e-05,
"loss": 0.0461,
"num_input_tokens_seen": 324576,
"step": 1515
},
{
"epoch": 4.825396825396825,
"grad_norm": 0.14115196466445923,
"learning_rate": 4.70280026630603e-05,
"loss": 0.0024,
"num_input_tokens_seen": 325648,
"step": 1520
},
{
"epoch": 4.841269841269841,
"grad_norm": 0.0023193114902824163,
"learning_rate": 4.699516606277638e-05,
"loss": 0.0036,
"num_input_tokens_seen": 326720,
"step": 1525
},
{
"epoch": 4.857142857142857,
"grad_norm": 1.0476378202438354,
"learning_rate": 4.6962160651546416e-05,
"loss": 0.0138,
"num_input_tokens_seen": 327808,
"step": 1530
},
{
"epoch": 4.8730158730158735,
"grad_norm": 0.0014104668516665697,
"learning_rate": 4.6928986682684004e-05,
"loss": 0.0093,
"num_input_tokens_seen": 328912,
"step": 1535
},
{
"epoch": 4.888888888888889,
"grad_norm": 1.5215229988098145,
"learning_rate": 4.6895644410796416e-05,
"loss": 0.0204,
"num_input_tokens_seen": 329952,
"step": 1540
},
{
"epoch": 4.904761904761905,
"grad_norm": 5.0272603034973145,
"learning_rate": 4.686213409178262e-05,
"loss": 0.0087,
"num_input_tokens_seen": 331008,
"step": 1545
},
{
"epoch": 4.920634920634921,
"grad_norm": 0.038513634353876114,
"learning_rate": 4.6828455982831334e-05,
"loss": 0.0004,
"num_input_tokens_seen": 332048,
"step": 1550
},
{
"epoch": 4.936507936507937,
"grad_norm": 0.07715779542922974,
"learning_rate": 4.679461034241906e-05,
"loss": 0.0687,
"num_input_tokens_seen": 333152,
"step": 1555
},
{
"epoch": 4.9523809523809526,
"grad_norm": 0.0013984109973534942,
"learning_rate": 4.6760597430308085e-05,
"loss": 0.0308,
"num_input_tokens_seen": 334160,
"step": 1560
},
{
"epoch": 4.968253968253968,
"grad_norm": 0.2820828855037689,
"learning_rate": 4.672641750754449e-05,
"loss": 0.0026,
"num_input_tokens_seen": 335184,
"step": 1565
},
{
"epoch": 4.984126984126984,
"grad_norm": 0.04478468745946884,
"learning_rate": 4.6692070836456126e-05,
"loss": 0.0442,
"num_input_tokens_seen": 336256,
"step": 1570
},
{
"epoch": 5.0,
"grad_norm": 2.1595120429992676,
"learning_rate": 4.6657557680650666e-05,
"loss": 0.0351,
"num_input_tokens_seen": 337408,
"step": 1575
},
{
"epoch": 5.0,
"eval_loss": 0.11039518564939499,
"eval_runtime": 1.4716,
"eval_samples_per_second": 47.568,
"eval_steps_per_second": 23.784,
"num_input_tokens_seen": 337408,
"step": 1575
},
{
"epoch": 5.015873015873016,
"grad_norm": 0.616256594657898,
"learning_rate": 4.6622878305013505e-05,
"loss": 0.0074,
"num_input_tokens_seen": 338496,
"step": 1580
},
{
"epoch": 5.031746031746032,
"grad_norm": 0.009507289156317711,
"learning_rate": 4.658803297570577e-05,
"loss": 0.0417,
"num_input_tokens_seen": 339568,
"step": 1585
},
{
"epoch": 5.0476190476190474,
"grad_norm": 2.117201089859009,
"learning_rate": 4.655302196016228e-05,
"loss": 0.0241,
"num_input_tokens_seen": 340608,
"step": 1590
},
{
"epoch": 5.063492063492063,
"grad_norm": 0.0259502362459898,
"learning_rate": 4.651784552708947e-05,
"loss": 0.0047,
"num_input_tokens_seen": 341648,
"step": 1595
},
{
"epoch": 5.079365079365079,
"grad_norm": 0.0022856765426695347,
"learning_rate": 4.6482503946463315e-05,
"loss": 0.0115,
"num_input_tokens_seen": 342768,
"step": 1600
},
{
"epoch": 5.095238095238095,
"grad_norm": 2.402535915374756,
"learning_rate": 4.644699748952733e-05,
"loss": 0.0139,
"num_input_tokens_seen": 343792,
"step": 1605
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.044295862317085266,
"learning_rate": 4.641132642879041e-05,
"loss": 0.0184,
"num_input_tokens_seen": 344832,
"step": 1610
},
{
"epoch": 5.1269841269841265,
"grad_norm": 0.0057604555040597916,
"learning_rate": 4.6375491038024785e-05,
"loss": 0.0412,
"num_input_tokens_seen": 345904,
"step": 1615
},
{
"epoch": 5.142857142857143,
"grad_norm": 0.006179352756589651,
"learning_rate": 4.6339491592263896e-05,
"loss": 0.0031,
"num_input_tokens_seen": 346912,
"step": 1620
},
{
"epoch": 5.158730158730159,
"grad_norm": 0.049357738345861435,
"learning_rate": 4.6303328367800284e-05,
"loss": 0.0134,
"num_input_tokens_seen": 348000,
"step": 1625
},
{
"epoch": 5.174603174603175,
"grad_norm": 14.490152359008789,
"learning_rate": 4.6267001642183496e-05,
"loss": 0.1828,
"num_input_tokens_seen": 349104,
"step": 1630
},
{
"epoch": 5.190476190476191,
"grad_norm": 0.024869563058018684,
"learning_rate": 4.6230511694217904e-05,
"loss": 0.0003,
"num_input_tokens_seen": 350192,
"step": 1635
},
{
"epoch": 5.2063492063492065,
"grad_norm": 0.1924477219581604,
"learning_rate": 4.619385880396064e-05,
"loss": 0.0231,
"num_input_tokens_seen": 351312,
"step": 1640
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.003086981363594532,
"learning_rate": 4.615704325271937e-05,
"loss": 0.0787,
"num_input_tokens_seen": 352368,
"step": 1645
},
{
"epoch": 5.238095238095238,
"grad_norm": 0.07954048365354538,
"learning_rate": 4.612006532305019e-05,
"loss": 0.0655,
"num_input_tokens_seen": 353408,
"step": 1650
},
{
"epoch": 5.253968253968254,
"grad_norm": 0.02726924791932106,
"learning_rate": 4.608292529875541e-05,
"loss": 0.0176,
"num_input_tokens_seen": 354464,
"step": 1655
},
{
"epoch": 5.26984126984127,
"grad_norm": 0.007141091860830784,
"learning_rate": 4.604562346488144e-05,
"loss": 0.0151,
"num_input_tokens_seen": 355488,
"step": 1660
},
{
"epoch": 5.285714285714286,
"grad_norm": 0.007293707691133022,
"learning_rate": 4.600816010771652e-05,
"loss": 0.0065,
"num_input_tokens_seen": 356544,
"step": 1665
},
{
"epoch": 5.301587301587301,
"grad_norm": 2.8001534938812256,
"learning_rate": 4.5970535514788596e-05,
"loss": 0.0226,
"num_input_tokens_seen": 357680,
"step": 1670
},
{
"epoch": 5.317460317460317,
"grad_norm": 0.003933146595954895,
"learning_rate": 4.593274997486309e-05,
"loss": 0.0061,
"num_input_tokens_seen": 358816,
"step": 1675
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.19064919650554657,
"learning_rate": 4.589480377794064e-05,
"loss": 0.0082,
"num_input_tokens_seen": 359872,
"step": 1680
},
{
"epoch": 5.349206349206349,
"grad_norm": 3.8689186573028564,
"learning_rate": 4.585669721525496e-05,
"loss": 0.0432,
"num_input_tokens_seen": 360928,
"step": 1685
},
{
"epoch": 5.365079365079365,
"grad_norm": 1.693311333656311,
"learning_rate": 4.581843057927053e-05,
"loss": 0.0032,
"num_input_tokens_seen": 362032,
"step": 1690
},
{
"epoch": 5.380952380952381,
"grad_norm": 0.000897102989256382,
"learning_rate": 4.5780004163680365e-05,
"loss": 0.0865,
"num_input_tokens_seen": 363168,
"step": 1695
},
{
"epoch": 5.396825396825397,
"grad_norm": 0.0013103618985041976,
"learning_rate": 4.574141826340382e-05,
"loss": 0.0484,
"num_input_tokens_seen": 364224,
"step": 1700
},
{
"epoch": 5.412698412698413,
"grad_norm": 4.227046489715576,
"learning_rate": 4.570267317458423e-05,
"loss": 0.0092,
"num_input_tokens_seen": 365376,
"step": 1705
},
{
"epoch": 5.428571428571429,
"grad_norm": 8.825630187988281,
"learning_rate": 4.566376919458672e-05,
"loss": 0.1041,
"num_input_tokens_seen": 366448,
"step": 1710
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.020684687420725822,
"learning_rate": 4.562470662199588e-05,
"loss": 0.0078,
"num_input_tokens_seen": 367552,
"step": 1715
},
{
"epoch": 5.4603174603174605,
"grad_norm": 0.7338157296180725,
"learning_rate": 4.5585485756613486e-05,
"loss": 0.0226,
"num_input_tokens_seen": 368672,
"step": 1720
},
{
"epoch": 5.476190476190476,
"grad_norm": 5.003768444061279,
"learning_rate": 4.5546106899456186e-05,
"loss": 0.0418,
"num_input_tokens_seen": 369744,
"step": 1725
},
{
"epoch": 5.492063492063492,
"grad_norm": 0.0029771197587251663,
"learning_rate": 4.550657035275323e-05,
"loss": 0.0051,
"num_input_tokens_seen": 370784,
"step": 1730
},
{
"epoch": 5.507936507936508,
"grad_norm": 2.141559600830078,
"learning_rate": 4.546687641994409e-05,
"loss": 0.0283,
"num_input_tokens_seen": 371872,
"step": 1735
},
{
"epoch": 5.523809523809524,
"grad_norm": 0.06480997800827026,
"learning_rate": 4.542702540567618e-05,
"loss": 0.0014,
"num_input_tokens_seen": 372912,
"step": 1740
},
{
"epoch": 5.5396825396825395,
"grad_norm": 5.961637020111084,
"learning_rate": 4.53870176158025e-05,
"loss": 0.0199,
"num_input_tokens_seen": 374080,
"step": 1745
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.004954809322953224,
"learning_rate": 4.534685335737926e-05,
"loss": 0.0001,
"num_input_tokens_seen": 375120,
"step": 1750
},
{
"epoch": 5.571428571428571,
"grad_norm": 2.826611042022705,
"learning_rate": 4.530653293866361e-05,
"loss": 0.008,
"num_input_tokens_seen": 376224,
"step": 1755
},
{
"epoch": 5.587301587301587,
"grad_norm": 8.976028442382812,
"learning_rate": 4.526605666911116e-05,
"loss": 0.0135,
"num_input_tokens_seen": 377248,
"step": 1760
},
{
"epoch": 5.603174603174603,
"grad_norm": 1.2282873392105103,
"learning_rate": 4.522542485937369e-05,
"loss": 0.0212,
"num_input_tokens_seen": 378288,
"step": 1765
},
{
"epoch": 5.619047619047619,
"grad_norm": 1.1623200178146362,
"learning_rate": 4.518463782129673e-05,
"loss": 0.0388,
"num_input_tokens_seen": 379376,
"step": 1770
},
{
"epoch": 5.634920634920634,
"grad_norm": 7.183865547180176,
"learning_rate": 4.514369586791718e-05,
"loss": 0.3362,
"num_input_tokens_seen": 380480,
"step": 1775
},
{
"epoch": 5.650793650793651,
"grad_norm": 0.01554072555154562,
"learning_rate": 4.510259931346088e-05,
"loss": 0.004,
"num_input_tokens_seen": 381584,
"step": 1780
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.002607325091958046,
"learning_rate": 4.506134847334026e-05,
"loss": 0.0069,
"num_input_tokens_seen": 382576,
"step": 1785
},
{
"epoch": 5.682539682539683,
"grad_norm": 0.003622877411544323,
"learning_rate": 4.5019943664151836e-05,
"loss": 0.003,
"num_input_tokens_seen": 383616,
"step": 1790
},
{
"epoch": 5.698412698412699,
"grad_norm": 0.0043928856030106544,
"learning_rate": 4.4978385203673845e-05,
"loss": 0.0011,
"num_input_tokens_seen": 384560,
"step": 1795
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.11677252501249313,
"learning_rate": 4.493667341086379e-05,
"loss": 0.0065,
"num_input_tokens_seen": 385568,
"step": 1800
},
{
"epoch": 5.73015873015873,
"grad_norm": 0.7192163467407227,
"learning_rate": 4.4894808605855966e-05,
"loss": 0.0052,
"num_input_tokens_seen": 386672,
"step": 1805
},
{
"epoch": 5.746031746031746,
"grad_norm": 0.004437014926224947,
"learning_rate": 4.485279110995903e-05,
"loss": 0.0114,
"num_input_tokens_seen": 387712,
"step": 1810
},
{
"epoch": 5.761904761904762,
"grad_norm": 0.002189048333093524,
"learning_rate": 4.481062124565354e-05,
"loss": 0.0072,
"num_input_tokens_seen": 388752,
"step": 1815
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.0018596505979076028,
"learning_rate": 4.476829933658946e-05,
"loss": 0.0228,
"num_input_tokens_seen": 389744,
"step": 1820
},
{
"epoch": 5.7936507936507935,
"grad_norm": 0.004972801543772221,
"learning_rate": 4.472582570758367e-05,
"loss": 0.0174,
"num_input_tokens_seen": 390800,
"step": 1825
},
{
"epoch": 5.809523809523809,
"grad_norm": 0.3761378228664398,
"learning_rate": 4.4683200684617516e-05,
"loss": 0.0444,
"num_input_tokens_seen": 391872,
"step": 1830
},
{
"epoch": 5.825396825396825,
"grad_norm": 3.0954763889312744,
"learning_rate": 4.464042459483425e-05,
"loss": 0.0032,
"num_input_tokens_seen": 393056,
"step": 1835
},
{
"epoch": 5.841269841269841,
"grad_norm": 0.12228795886039734,
"learning_rate": 4.459749776653658e-05,
"loss": 0.0005,
"num_input_tokens_seen": 394096,
"step": 1840
},
{
"epoch": 5.857142857142857,
"grad_norm": 0.0031736583914607763,
"learning_rate": 4.455442052918408e-05,
"loss": 0.0168,
"num_input_tokens_seen": 395200,
"step": 1845
},
{
"epoch": 5.8730158730158735,
"grad_norm": 6.396153450012207,
"learning_rate": 4.4511193213390736e-05,
"loss": 0.0638,
"num_input_tokens_seen": 396288,
"step": 1850
},
{
"epoch": 5.888888888888889,
"grad_norm": 9.804662704467773,
"learning_rate": 4.446781615092235e-05,
"loss": 0.0354,
"num_input_tokens_seen": 397344,
"step": 1855
},
{
"epoch": 5.904761904761905,
"grad_norm": 0.2461097687482834,
"learning_rate": 4.442428967469403e-05,
"loss": 0.0198,
"num_input_tokens_seen": 398480,
"step": 1860
},
{
"epoch": 5.920634920634921,
"grad_norm": 0.31156083941459656,
"learning_rate": 4.4380614118767604e-05,
"loss": 0.001,
"num_input_tokens_seen": 399536,
"step": 1865
},
{
"epoch": 5.936507936507937,
"grad_norm": 1.9175351858139038,
"learning_rate": 4.43367898183491e-05,
"loss": 0.0077,
"num_input_tokens_seen": 400624,
"step": 1870
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.004446444101631641,
"learning_rate": 4.429281710978612e-05,
"loss": 0.0241,
"num_input_tokens_seen": 401696,
"step": 1875
},
{
"epoch": 5.968253968253968,
"grad_norm": 0.01167643815279007,
"learning_rate": 4.4248696330565305e-05,
"loss": 0.0003,
"num_input_tokens_seen": 402768,
"step": 1880
},
{
"epoch": 5.984126984126984,
"grad_norm": 0.02273648977279663,
"learning_rate": 4.42044278193097e-05,
"loss": 0.0213,
"num_input_tokens_seen": 403872,
"step": 1885
},
{
"epoch": 6.0,
"grad_norm": 0.07438544929027557,
"learning_rate": 4.4160011915776224e-05,
"loss": 0.0005,
"num_input_tokens_seen": 404880,
"step": 1890
},
{
"epoch": 6.0,
"eval_loss": 0.11757393181324005,
"eval_runtime": 1.4812,
"eval_samples_per_second": 47.26,
"eval_steps_per_second": 23.63,
"num_input_tokens_seen": 404880,
"step": 1890
},
{
"epoch": 6.015873015873016,
"grad_norm": 0.0006745086866430938,
"learning_rate": 4.4115448960852965e-05,
"loss": 0.0045,
"num_input_tokens_seen": 405952,
"step": 1895
},
{
"epoch": 6.031746031746032,
"grad_norm": 0.01215858943760395,
"learning_rate": 4.407073929655666e-05,
"loss": 0.0054,
"num_input_tokens_seen": 407040,
"step": 1900
},
{
"epoch": 6.0476190476190474,
"grad_norm": 0.0003449621726758778,
"learning_rate": 4.402588326603002e-05,
"loss": 0.003,
"num_input_tokens_seen": 408128,
"step": 1905
},
{
"epoch": 6.063492063492063,
"grad_norm": 0.0432710237801075,
"learning_rate": 4.398088121353907e-05,
"loss": 0.0013,
"num_input_tokens_seen": 409168,
"step": 1910
},
{
"epoch": 6.079365079365079,
"grad_norm": 0.01051880232989788,
"learning_rate": 4.393573348447059e-05,
"loss": 0.0091,
"num_input_tokens_seen": 410208,
"step": 1915
},
{
"epoch": 6.095238095238095,
"grad_norm": 0.29065069556236267,
"learning_rate": 4.3890440425329367e-05,
"loss": 0.0187,
"num_input_tokens_seen": 411264,
"step": 1920
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.0008527844329364598,
"learning_rate": 4.384500238373563e-05,
"loss": 0.0015,
"num_input_tokens_seen": 412272,
"step": 1925
},
{
"epoch": 6.1269841269841265,
"grad_norm": 0.02228499948978424,
"learning_rate": 4.37994197084223e-05,
"loss": 0.0691,
"num_input_tokens_seen": 413312,
"step": 1930
},
{
"epoch": 6.142857142857143,
"grad_norm": 0.06491630524396896,
"learning_rate": 4.375369274923237e-05,
"loss": 0.0036,
"num_input_tokens_seen": 414352,
"step": 1935
},
{
"epoch": 6.158730158730159,
"grad_norm": 0.008396030403673649,
"learning_rate": 4.3707821857116176e-05,
"loss": 0.0002,
"num_input_tokens_seen": 415440,
"step": 1940
},
{
"epoch": 6.174603174603175,
"grad_norm": 0.4968828558921814,
"learning_rate": 4.366180738412876e-05,
"loss": 0.0009,
"num_input_tokens_seen": 416576,
"step": 1945
},
{
"epoch": 6.190476190476191,
"grad_norm": 6.394617557525635,
"learning_rate": 4.3615649683427094e-05,
"loss": 0.0095,
"num_input_tokens_seen": 417680,
"step": 1950
},
{
"epoch": 6.2063492063492065,
"grad_norm": 0.004320154897868633,
"learning_rate": 4.356934910926746e-05,
"loss": 0.0162,
"num_input_tokens_seen": 418784,
"step": 1955
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.004840894602239132,
"learning_rate": 4.352290601700263e-05,
"loss": 0.0001,
"num_input_tokens_seen": 419824,
"step": 1960
},
{
"epoch": 6.238095238095238,
"grad_norm": 0.009394151158630848,
"learning_rate": 4.347632076307921e-05,
"loss": 0.0011,
"num_input_tokens_seen": 420912,
"step": 1965
},
{
"epoch": 6.253968253968254,
"grad_norm": 9.433398246765137,
"learning_rate": 4.3429593705034896e-05,
"loss": 0.0521,
"num_input_tokens_seen": 421968,
"step": 1970
},
{
"epoch": 6.26984126984127,
"grad_norm": 0.0032175371889024973,
"learning_rate": 4.3382725201495723e-05,
"loss": 0.0005,
"num_input_tokens_seen": 423040,
"step": 1975
},
{
"epoch": 6.285714285714286,
"grad_norm": 0.002436830196529627,
"learning_rate": 4.333571561217326e-05,
"loss": 0.0234,
"num_input_tokens_seen": 424112,
"step": 1980
},
{
"epoch": 6.301587301587301,
"grad_norm": 0.001278755022212863,
"learning_rate": 4.328856529786196e-05,
"loss": 0.0071,
"num_input_tokens_seen": 425200,
"step": 1985
},
{
"epoch": 6.317460317460317,
"grad_norm": 0.0035431934520602226,
"learning_rate": 4.324127462043627e-05,
"loss": 0.0298,
"num_input_tokens_seen": 426240,
"step": 1990
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.05192156881093979,
"learning_rate": 4.319384394284797e-05,
"loss": 0.0013,
"num_input_tokens_seen": 427328,
"step": 1995
},
{
"epoch": 6.349206349206349,
"grad_norm": 0.0009910666849464178,
"learning_rate": 4.314627362912327e-05,
"loss": 0.0145,
"num_input_tokens_seen": 428320,
"step": 2000
},
{
"epoch": 6.365079365079365,
"grad_norm": 0.0481291264295578,
"learning_rate": 4.309856404436012e-05,
"loss": 0.0002,
"num_input_tokens_seen": 429360,
"step": 2005
},
{
"epoch": 6.380952380952381,
"grad_norm": 1.888258934020996,
"learning_rate": 4.305071555472534e-05,
"loss": 0.0035,
"num_input_tokens_seen": 430432,
"step": 2010
},
{
"epoch": 6.396825396825397,
"grad_norm": 0.028912700712680817,
"learning_rate": 4.300272852745184e-05,
"loss": 0.0283,
"num_input_tokens_seen": 431488,
"step": 2015
},
{
"epoch": 6.412698412698413,
"grad_norm": 0.34435898065567017,
"learning_rate": 4.2954603330835794e-05,
"loss": 0.0064,
"num_input_tokens_seen": 432544,
"step": 2020
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.0017212865641340613,
"learning_rate": 4.290634033423381e-05,
"loss": 0.0146,
"num_input_tokens_seen": 433664,
"step": 2025
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.38410279154777527,
"learning_rate": 4.2857939908060094e-05,
"loss": 0.0066,
"num_input_tokens_seen": 434784,
"step": 2030
},
{
"epoch": 6.4603174603174605,
"grad_norm": 0.005071816500276327,
"learning_rate": 4.2809402423783624e-05,
"loss": 0.0048,
"num_input_tokens_seen": 435856,
"step": 2035
},
{
"epoch": 6.476190476190476,
"grad_norm": 0.0019863054621964693,
"learning_rate": 4.276072825392528e-05,
"loss": 0.0061,
"num_input_tokens_seen": 436912,
"step": 2040
},
{
"epoch": 6.492063492063492,
"grad_norm": 0.0015548643423244357,
"learning_rate": 4.2711917772055e-05,
"loss": 0.025,
"num_input_tokens_seen": 437984,
"step": 2045
},
{
"epoch": 6.507936507936508,
"grad_norm": 0.008908271789550781,
"learning_rate": 4.2662971352788886e-05,
"loss": 0.0144,
"num_input_tokens_seen": 439040,
"step": 2050
},
{
"epoch": 6.523809523809524,
"grad_norm": 5.70704984664917,
"learning_rate": 4.261388937178636e-05,
"loss": 0.0701,
"num_input_tokens_seen": 440160,
"step": 2055
},
{
"epoch": 6.5396825396825395,
"grad_norm": 0.027168529108166695,
"learning_rate": 4.256467220574728e-05,
"loss": 0.0221,
"num_input_tokens_seen": 441264,
"step": 2060
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.1720827966928482,
"learning_rate": 4.251532023240901e-05,
"loss": 0.0012,
"num_input_tokens_seen": 442288,
"step": 2065
},
{
"epoch": 6.571428571428571,
"grad_norm": 0.4792938530445099,
"learning_rate": 4.246583383054357e-05,
"loss": 0.012,
"num_input_tokens_seen": 443344,
"step": 2070
},
{
"epoch": 6.587301587301587,
"grad_norm": 0.03523029014468193,
"learning_rate": 4.241621337995469e-05,
"loss": 0.0116,
"num_input_tokens_seen": 444480,
"step": 2075
},
{
"epoch": 6.603174603174603,
"grad_norm": 0.007401628885418177,
"learning_rate": 4.2366459261474933e-05,
"loss": 0.0536,
"num_input_tokens_seen": 445504,
"step": 2080
},
{
"epoch": 6.619047619047619,
"grad_norm": 0.007667475380003452,
"learning_rate": 4.2316571856962736e-05,
"loss": 0.0137,
"num_input_tokens_seen": 446544,
"step": 2085
},
{
"epoch": 6.634920634920634,
"grad_norm": 0.042317815124988556,
"learning_rate": 4.2266551549299496e-05,
"loss": 0.0007,
"num_input_tokens_seen": 447632,
"step": 2090
},
{
"epoch": 6.650793650793651,
"grad_norm": 0.008923333138227463,
"learning_rate": 4.221639872238662e-05,
"loss": 0.0121,
"num_input_tokens_seen": 448672,
"step": 2095
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.0037640526425093412,
"learning_rate": 4.2166113761142626e-05,
"loss": 0.0126,
"num_input_tokens_seen": 449792,
"step": 2100
},
{
"epoch": 6.682539682539683,
"grad_norm": 1.3072272539138794,
"learning_rate": 4.2115697051500104e-05,
"loss": 0.0179,
"num_input_tokens_seen": 450848,
"step": 2105
},
{
"epoch": 6.698412698412699,
"grad_norm": 0.001850523636676371,
"learning_rate": 4.2065148980402835e-05,
"loss": 0.0004,
"num_input_tokens_seen": 451952,
"step": 2110
},
{
"epoch": 6.714285714285714,
"grad_norm": 0.004209038335829973,
"learning_rate": 4.201446993580276e-05,
"loss": 0.0066,
"num_input_tokens_seen": 453008,
"step": 2115
},
{
"epoch": 6.73015873015873,
"grad_norm": 0.32531654834747314,
"learning_rate": 4.1963660306657074e-05,
"loss": 0.053,
"num_input_tokens_seen": 454080,
"step": 2120
},
{
"epoch": 6.746031746031746,
"grad_norm": 0.0027206395752727985,
"learning_rate": 4.191272048292513e-05,
"loss": 0.0001,
"num_input_tokens_seen": 455168,
"step": 2125
},
{
"epoch": 6.761904761904762,
"grad_norm": 0.0008674302371218801,
"learning_rate": 4.186165085556558e-05,
"loss": 0.0001,
"num_input_tokens_seen": 456272,
"step": 2130
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.019082186743617058,
"learning_rate": 4.181045181653327e-05,
"loss": 0.0741,
"num_input_tokens_seen": 457328,
"step": 2135
},
{
"epoch": 6.7936507936507935,
"grad_norm": 0.0038378050085157156,
"learning_rate": 4.175912375877628e-05,
"loss": 0.0443,
"num_input_tokens_seen": 458416,
"step": 2140
},
{
"epoch": 6.809523809523809,
"grad_norm": 0.00021153379930183291,
"learning_rate": 4.170766707623289e-05,
"loss": 0.0009,
"num_input_tokens_seen": 459520,
"step": 2145
},
{
"epoch": 6.825396825396825,
"grad_norm": 0.05300087854266167,
"learning_rate": 4.1656082163828566e-05,
"loss": 0.0099,
"num_input_tokens_seen": 460576,
"step": 2150
},
{
"epoch": 6.841269841269841,
"grad_norm": 0.0009460219880566001,
"learning_rate": 4.160436941747293e-05,
"loss": 0.0141,
"num_input_tokens_seen": 461648,
"step": 2155
},
{
"epoch": 6.857142857142857,
"grad_norm": 5.004072666168213,
"learning_rate": 4.155252923405672e-05,
"loss": 0.0033,
"num_input_tokens_seen": 462672,
"step": 2160
},
{
"epoch": 6.8730158730158735,
"grad_norm": 0.0005544586456380785,
"learning_rate": 4.1500562011448744e-05,
"loss": 0.0001,
"num_input_tokens_seen": 463760,
"step": 2165
},
{
"epoch": 6.888888888888889,
"grad_norm": 2.4104061126708984,
"learning_rate": 4.144846814849282e-05,
"loss": 0.0036,
"num_input_tokens_seen": 464784,
"step": 2170
},
{
"epoch": 6.904761904761905,
"grad_norm": 0.004637118428945541,
"learning_rate": 4.1396248045004703e-05,
"loss": 0.0017,
"num_input_tokens_seen": 465808,
"step": 2175
},
{
"epoch": 6.920634920634921,
"grad_norm": 0.05216735601425171,
"learning_rate": 4.134390210176907e-05,
"loss": 0.0002,
"num_input_tokens_seen": 466896,
"step": 2180
},
{
"epoch": 6.936507936507937,
"grad_norm": 0.0004108586290385574,
"learning_rate": 4.129143072053638e-05,
"loss": 0.0001,
"num_input_tokens_seen": 468000,
"step": 2185
},
{
"epoch": 6.9523809523809526,
"grad_norm": 0.009106731042265892,
"learning_rate": 4.1238834304019825e-05,
"loss": 0.0033,
"num_input_tokens_seen": 469056,
"step": 2190
},
{
"epoch": 6.968253968253968,
"grad_norm": 0.0018852164503186941,
"learning_rate": 4.118611325589222e-05,
"loss": 0.0151,
"num_input_tokens_seen": 470176,
"step": 2195
},
{
"epoch": 6.984126984126984,
"grad_norm": 0.1440330594778061,
"learning_rate": 4.113326798078294e-05,
"loss": 0.0058,
"num_input_tokens_seen": 471216,
"step": 2200
},
{
"epoch": 7.0,
"grad_norm": 0.00032442359952256083,
"learning_rate": 4.108029888427476e-05,
"loss": 0.0416,
"num_input_tokens_seen": 472240,
"step": 2205
},
{
"epoch": 7.0,
"eval_loss": 0.20159904658794403,
"eval_runtime": 1.4568,
"eval_samples_per_second": 48.05,
"eval_steps_per_second": 24.025,
"num_input_tokens_seen": 472240,
"step": 2205
},
{
"epoch": 7.015873015873016,
"grad_norm": 0.2510806918144226,
"learning_rate": 4.1027206372900816e-05,
"loss": 0.0579,
"num_input_tokens_seen": 473312,
"step": 2210
},
{
"epoch": 7.031746031746032,
"grad_norm": 10.269424438476562,
"learning_rate": 4.09739908541414e-05,
"loss": 0.0236,
"num_input_tokens_seen": 474416,
"step": 2215
},
{
"epoch": 7.0476190476190474,
"grad_norm": 7.632909774780273,
"learning_rate": 4.09206527364209e-05,
"loss": 0.0409,
"num_input_tokens_seen": 475504,
"step": 2220
},
{
"epoch": 7.063492063492063,
"grad_norm": 7.554219722747803,
"learning_rate": 4.0867192429104627e-05,
"loss": 0.0383,
"num_input_tokens_seen": 476512,
"step": 2225
},
{
"epoch": 7.079365079365079,
"grad_norm": 0.004015278071165085,
"learning_rate": 4.08136103424957e-05,
"loss": 0.0046,
"num_input_tokens_seen": 477648,
"step": 2230
},
{
"epoch": 7.095238095238095,
"grad_norm": 2.802708148956299,
"learning_rate": 4.075990688783185e-05,
"loss": 0.0081,
"num_input_tokens_seen": 478768,
"step": 2235
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.0031349605415016413,
"learning_rate": 4.070608247728236e-05,
"loss": 0.0003,
"num_input_tokens_seen": 479840,
"step": 2240
},
{
"epoch": 7.1269841269841265,
"grad_norm": 0.017263107001781464,
"learning_rate": 4.065213752394478e-05,
"loss": 0.0007,
"num_input_tokens_seen": 480928,
"step": 2245
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.0022108752746134996,
"learning_rate": 4.059807244184183e-05,
"loss": 0.0026,
"num_input_tokens_seen": 481984,
"step": 2250
},
{
"epoch": 7.158730158730159,
"grad_norm": 0.007918480783700943,
"learning_rate": 4.054388764591822e-05,
"loss": 0.0006,
"num_input_tokens_seen": 483104,
"step": 2255
},
{
"epoch": 7.174603174603175,
"grad_norm": 0.06933493167161942,
"learning_rate": 4.048958355203746e-05,
"loss": 0.0088,
"num_input_tokens_seen": 484240,
"step": 2260
},
{
"epoch": 7.190476190476191,
"grad_norm": 0.16983921825885773,
"learning_rate": 4.043516057697862e-05,
"loss": 0.0006,
"num_input_tokens_seen": 485328,
"step": 2265
},
{
"epoch": 7.2063492063492065,
"grad_norm": 0.004488496109843254,
"learning_rate": 4.038061913843322e-05,
"loss": 0.0005,
"num_input_tokens_seen": 486432,
"step": 2270
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.0012461389414966106,
"learning_rate": 4.032595965500195e-05,
"loss": 0.0007,
"num_input_tokens_seen": 487520,
"step": 2275
},
{
"epoch": 7.238095238095238,
"grad_norm": 0.0025354893878102303,
"learning_rate": 4.02711825461915e-05,
"loss": 0.0269,
"num_input_tokens_seen": 488576,
"step": 2280
},
{
"epoch": 7.253968253968254,
"grad_norm": 0.16530795395374298,
"learning_rate": 4.0216288232411296e-05,
"loss": 0.0005,
"num_input_tokens_seen": 489648,
"step": 2285
},
{
"epoch": 7.26984126984127,
"grad_norm": 0.09864303469657898,
"learning_rate": 4.0161277134970345e-05,
"loss": 0.0005,
"num_input_tokens_seen": 490736,
"step": 2290
},
{
"epoch": 7.285714285714286,
"grad_norm": 0.24059996008872986,
"learning_rate": 4.010614967607391e-05,
"loss": 0.0003,
"num_input_tokens_seen": 491792,
"step": 2295
},
{
"epoch": 7.301587301587301,
"grad_norm": 0.06463675945997238,
"learning_rate": 4.005090627882035e-05,
"loss": 0.0018,
"num_input_tokens_seen": 492832,
"step": 2300
},
{
"epoch": 7.317460317460317,
"grad_norm": 0.0006509863305836916,
"learning_rate": 3.9995547367197845e-05,
"loss": 0.0005,
"num_input_tokens_seen": 493824,
"step": 2305
},
{
"epoch": 7.333333333333333,
"grad_norm": 3.077014684677124,
"learning_rate": 3.9940073366081114e-05,
"loss": 0.0173,
"num_input_tokens_seen": 494880,
"step": 2310
},
{
"epoch": 7.349206349206349,
"grad_norm": 0.0008826262201182544,
"learning_rate": 3.988448470122819e-05,
"loss": 0.0024,
"num_input_tokens_seen": 495952,
"step": 2315
},
{
"epoch": 7.365079365079365,
"grad_norm": 1.0985521078109741,
"learning_rate": 3.982878179927714e-05,
"loss": 0.0028,
"num_input_tokens_seen": 497024,
"step": 2320
},
{
"epoch": 7.380952380952381,
"grad_norm": 0.0007766135386191308,
"learning_rate": 3.977296508774278e-05,
"loss": 0.0,
"num_input_tokens_seen": 498048,
"step": 2325
},
{
"epoch": 7.396825396825397,
"grad_norm": 0.001252246554940939,
"learning_rate": 3.971703499501344e-05,
"loss": 0.002,
"num_input_tokens_seen": 499120,
"step": 2330
},
{
"epoch": 7.412698412698413,
"grad_norm": 5.723186016082764,
"learning_rate": 3.9660991950347576e-05,
"loss": 0.0089,
"num_input_tokens_seen": 500160,
"step": 2335
},
{
"epoch": 7.428571428571429,
"grad_norm": 0.0003589835832826793,
"learning_rate": 3.960483638387061e-05,
"loss": 0.0002,
"num_input_tokens_seen": 501264,
"step": 2340
},
{
"epoch": 7.444444444444445,
"grad_norm": 5.556662082672119,
"learning_rate": 3.954856872657151e-05,
"loss": 0.0392,
"num_input_tokens_seen": 502384,
"step": 2345
},
{
"epoch": 7.4603174603174605,
"grad_norm": 0.05227256566286087,
"learning_rate": 3.9492189410299566e-05,
"loss": 0.0012,
"num_input_tokens_seen": 503456,
"step": 2350
},
{
"epoch": 7.476190476190476,
"grad_norm": 0.0018900517607107759,
"learning_rate": 3.9435698867760996e-05,
"loss": 0.0175,
"num_input_tokens_seen": 504480,
"step": 2355
},
{
"epoch": 7.492063492063492,
"grad_norm": 19.124439239501953,
"learning_rate": 3.9379097532515725e-05,
"loss": 0.0634,
"num_input_tokens_seen": 505584,
"step": 2360
},
{
"epoch": 7.507936507936508,
"grad_norm": 0.006673491094261408,
"learning_rate": 3.932238583897395e-05,
"loss": 0.0,
"num_input_tokens_seen": 506640,
"step": 2365
},
{
"epoch": 7.523809523809524,
"grad_norm": 0.0003134336438961327,
"learning_rate": 3.9265564222392905e-05,
"loss": 0.0,
"num_input_tokens_seen": 507696,
"step": 2370
},
{
"epoch": 7.5396825396825395,
"grad_norm": 0.0006025207112543285,
"learning_rate": 3.920863311887344e-05,
"loss": 0.0001,
"num_input_tokens_seen": 508688,
"step": 2375
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.07202655076980591,
"learning_rate": 3.9151592965356705e-05,
"loss": 0.0002,
"num_input_tokens_seen": 509792,
"step": 2380
},
{
"epoch": 7.571428571428571,
"grad_norm": 0.012028384022414684,
"learning_rate": 3.909444419962083e-05,
"loss": 0.0,
"num_input_tokens_seen": 510880,
"step": 2385
},
{
"epoch": 7.587301587301587,
"grad_norm": 0.031248344108462334,
"learning_rate": 3.9037187260277515e-05,
"loss": 0.0154,
"num_input_tokens_seen": 511888,
"step": 2390
},
{
"epoch": 7.603174603174603,
"grad_norm": 6.459189414978027,
"learning_rate": 3.897982258676867e-05,
"loss": 0.0308,
"num_input_tokens_seen": 513008,
"step": 2395
},
{
"epoch": 7.619047619047619,
"grad_norm": 4.979741096496582,
"learning_rate": 3.892235061936309e-05,
"loss": 0.0406,
"num_input_tokens_seen": 514048,
"step": 2400
},
{
"epoch": 7.634920634920634,
"grad_norm": 4.767922401428223,
"learning_rate": 3.886477179915301e-05,
"loss": 0.0185,
"num_input_tokens_seen": 515200,
"step": 2405
},
{
"epoch": 7.650793650793651,
"grad_norm": 0.14602185785770416,
"learning_rate": 3.880708656805075e-05,
"loss": 0.0118,
"num_input_tokens_seen": 516224,
"step": 2410
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.0051375702023506165,
"learning_rate": 3.874929536878536e-05,
"loss": 0.0041,
"num_input_tokens_seen": 517392,
"step": 2415
},
{
"epoch": 7.682539682539683,
"grad_norm": 7.391900539398193,
"learning_rate": 3.869139864489915e-05,
"loss": 0.0216,
"num_input_tokens_seen": 518464,
"step": 2420
},
{
"epoch": 7.698412698412699,
"grad_norm": 7.830747604370117,
"learning_rate": 3.863339684074432e-05,
"loss": 0.0318,
"num_input_tokens_seen": 519520,
"step": 2425
},
{
"epoch": 7.714285714285714,
"grad_norm": 0.024749888107180595,
"learning_rate": 3.8575290401479586e-05,
"loss": 0.0207,
"num_input_tokens_seen": 520640,
"step": 2430
},
{
"epoch": 7.73015873015873,
"grad_norm": 0.36452969908714294,
"learning_rate": 3.85170797730667e-05,
"loss": 0.0031,
"num_input_tokens_seen": 521648,
"step": 2435
},
{
"epoch": 7.746031746031746,
"grad_norm": 0.03560644015669823,
"learning_rate": 3.845876540226706e-05,
"loss": 0.0003,
"num_input_tokens_seen": 522720,
"step": 2440
},
{
"epoch": 7.761904761904762,
"grad_norm": 0.005877711810171604,
"learning_rate": 3.840034773663829e-05,
"loss": 0.0177,
"num_input_tokens_seen": 523808,
"step": 2445
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.0006981467013247311,
"learning_rate": 3.834182722453079e-05,
"loss": 0.0082,
"num_input_tokens_seen": 524864,
"step": 2450
},
{
"epoch": 7.7936507936507935,
"grad_norm": 0.06691333651542664,
"learning_rate": 3.828320431508429e-05,
"loss": 0.0113,
"num_input_tokens_seen": 525952,
"step": 2455
},
{
"epoch": 7.809523809523809,
"grad_norm": 0.7026005387306213,
"learning_rate": 3.8224479458224396e-05,
"loss": 0.0007,
"num_input_tokens_seen": 527008,
"step": 2460
},
{
"epoch": 7.825396825396825,
"grad_norm": 0.0011852675816044211,
"learning_rate": 3.8165653104659185e-05,
"loss": 0.0187,
"num_input_tokens_seen": 528000,
"step": 2465
},
{
"epoch": 7.841269841269841,
"grad_norm": 0.0029516047798097134,
"learning_rate": 3.81067257058757e-05,
"loss": 0.0002,
"num_input_tokens_seen": 529088,
"step": 2470
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.2371506243944168,
"learning_rate": 3.804769771413649e-05,
"loss": 0.0006,
"num_input_tokens_seen": 530144,
"step": 2475
},
{
"epoch": 7.8730158730158735,
"grad_norm": 0.015095270238816738,
"learning_rate": 3.7988569582476144e-05,
"loss": 0.0227,
"num_input_tokens_seen": 531232,
"step": 2480
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.0005210601957514882,
"learning_rate": 3.7929341764697816e-05,
"loss": 0.0028,
"num_input_tokens_seen": 532240,
"step": 2485
},
{
"epoch": 7.904761904761905,
"grad_norm": 0.07286439836025238,
"learning_rate": 3.787001471536976e-05,
"loss": 0.0031,
"num_input_tokens_seen": 533280,
"step": 2490
},
{
"epoch": 7.920634920634921,
"grad_norm": 0.004520993679761887,
"learning_rate": 3.78105888898218e-05,
"loss": 0.0,
"num_input_tokens_seen": 534352,
"step": 2495
},
{
"epoch": 7.936507936507937,
"grad_norm": 0.013191165402531624,
"learning_rate": 3.775106474414188e-05,
"loss": 0.0012,
"num_input_tokens_seen": 535392,
"step": 2500
},
{
"epoch": 7.9523809523809526,
"grad_norm": 0.02418154664337635,
"learning_rate": 3.769144273517253e-05,
"loss": 0.0001,
"num_input_tokens_seen": 536496,
"step": 2505
},
{
"epoch": 7.968253968253968,
"grad_norm": 0.009249486960470676,
"learning_rate": 3.7631723320507364e-05,
"loss": 0.019,
"num_input_tokens_seen": 537552,
"step": 2510
},
{
"epoch": 7.984126984126984,
"grad_norm": 0.002061500446870923,
"learning_rate": 3.7571906958487584e-05,
"loss": 0.0246,
"num_input_tokens_seen": 538656,
"step": 2515
},
{
"epoch": 8.0,
"grad_norm": 0.028927626088261604,
"learning_rate": 3.751199410819847e-05,
"loss": 0.0112,
"num_input_tokens_seen": 539744,
"step": 2520
},
{
"epoch": 8.0,
"eval_loss": 0.13668464124202728,
"eval_runtime": 1.4554,
"eval_samples_per_second": 48.095,
"eval_steps_per_second": 24.048,
"num_input_tokens_seen": 539744,
"step": 2520
},
{
"epoch": 8.015873015873016,
"grad_norm": 0.0014253915287554264,
"learning_rate": 3.745198522946582e-05,
"loss": 0.0003,
"num_input_tokens_seen": 540800,
"step": 2525
},
{
"epoch": 8.031746031746032,
"grad_norm": 0.07428203523159027,
"learning_rate": 3.739188078285244e-05,
"loss": 0.0007,
"num_input_tokens_seen": 541936,
"step": 2530
},
{
"epoch": 8.047619047619047,
"grad_norm": 6.837933540344238,
"learning_rate": 3.7331681229654635e-05,
"loss": 0.01,
"num_input_tokens_seen": 543040,
"step": 2535
},
{
"epoch": 8.063492063492063,
"grad_norm": 0.0031517180614173412,
"learning_rate": 3.727138703189862e-05,
"loss": 0.0068,
"num_input_tokens_seen": 544016,
"step": 2540
},
{
"epoch": 8.079365079365079,
"grad_norm": 0.00036266801180318,
"learning_rate": 3.721099865233701e-05,
"loss": 0.0012,
"num_input_tokens_seen": 545120,
"step": 2545
},
{
"epoch": 8.095238095238095,
"grad_norm": 0.00027236941969022155,
"learning_rate": 3.7150516554445256e-05,
"loss": 0.0006,
"num_input_tokens_seen": 546256,
"step": 2550
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.00025823916075751185,
"learning_rate": 3.708994120241809e-05,
"loss": 0.0001,
"num_input_tokens_seen": 547344,
"step": 2555
},
{
"epoch": 8.126984126984127,
"grad_norm": 0.0003119041211903095,
"learning_rate": 3.702927306116595e-05,
"loss": 0.0002,
"num_input_tokens_seen": 548352,
"step": 2560
},
{
"epoch": 8.142857142857142,
"grad_norm": 0.00040353136137127876,
"learning_rate": 3.6968512596311435e-05,
"loss": 0.0102,
"num_input_tokens_seen": 549328,
"step": 2565
},
{
"epoch": 8.158730158730158,
"grad_norm": 0.0004562221292871982,
"learning_rate": 3.690766027418573e-05,
"loss": 0.0001,
"num_input_tokens_seen": 550416,
"step": 2570
},
{
"epoch": 8.174603174603174,
"grad_norm": 0.006335284095257521,
"learning_rate": 3.6846716561824965e-05,
"loss": 0.0003,
"num_input_tokens_seen": 551504,
"step": 2575
},
{
"epoch": 8.19047619047619,
"grad_norm": 0.0002137407718691975,
"learning_rate": 3.678568192696677e-05,
"loss": 0.0022,
"num_input_tokens_seen": 552640,
"step": 2580
},
{
"epoch": 8.206349206349206,
"grad_norm": 0.000498905370477587,
"learning_rate": 3.672455683804651e-05,
"loss": 0.0127,
"num_input_tokens_seen": 553712,
"step": 2585
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.0003141605411656201,
"learning_rate": 3.6663341764193834e-05,
"loss": 0.0035,
"num_input_tokens_seen": 554800,
"step": 2590
},
{
"epoch": 8.238095238095237,
"grad_norm": 3.5881121158599854,
"learning_rate": 3.6602037175228986e-05,
"loss": 0.021,
"num_input_tokens_seen": 555824,
"step": 2595
},
{
"epoch": 8.253968253968253,
"grad_norm": 0.008271710947155952,
"learning_rate": 3.6540643541659245e-05,
"loss": 0.0001,
"num_input_tokens_seen": 556944,
"step": 2600
},
{
"epoch": 8.26984126984127,
"grad_norm": 0.0003118402964901179,
"learning_rate": 3.6479161334675296e-05,
"loss": 0.0001,
"num_input_tokens_seen": 557968,
"step": 2605
},
{
"epoch": 8.285714285714286,
"grad_norm": 0.010142582468688488,
"learning_rate": 3.641759102614761e-05,
"loss": 0.024,
"num_input_tokens_seen": 559104,
"step": 2610
},
{
"epoch": 8.301587301587302,
"grad_norm": 0.0003936065186280757,
"learning_rate": 3.6355933088622854e-05,
"loss": 0.0029,
"num_input_tokens_seen": 560128,
"step": 2615
},
{
"epoch": 8.317460317460318,
"grad_norm": 0.274812787771225,
"learning_rate": 3.6294187995320214e-05,
"loss": 0.0014,
"num_input_tokens_seen": 561248,
"step": 2620
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.00024725758703425527,
"learning_rate": 3.6232356220127785e-05,
"loss": 0.0003,
"num_input_tokens_seen": 562320,
"step": 2625
},
{
"epoch": 8.34920634920635,
"grad_norm": 0.0018928394420072436,
"learning_rate": 3.617043823759897e-05,
"loss": 0.0205,
"num_input_tokens_seen": 563392,
"step": 2630
},
{
"epoch": 8.365079365079366,
"grad_norm": 0.03286740928888321,
"learning_rate": 3.610843452294877e-05,
"loss": 0.0001,
"num_input_tokens_seen": 564480,
"step": 2635
},
{
"epoch": 8.380952380952381,
"grad_norm": 4.7357635498046875,
"learning_rate": 3.60463455520502e-05,
"loss": 0.0121,
"num_input_tokens_seen": 565616,
"step": 2640
},
{
"epoch": 8.396825396825397,
"grad_norm": 0.004558645188808441,
"learning_rate": 3.598417180143058e-05,
"loss": 0.0031,
"num_input_tokens_seen": 566704,
"step": 2645
},
{
"epoch": 8.412698412698413,
"grad_norm": 0.14909231662750244,
"learning_rate": 3.5921913748267945e-05,
"loss": 0.0276,
"num_input_tokens_seen": 567744,
"step": 2650
},
{
"epoch": 8.428571428571429,
"grad_norm": 0.00042908909381367266,
"learning_rate": 3.5859571870387304e-05,
"loss": 0.0,
"num_input_tokens_seen": 568816,
"step": 2655
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.012058122083544731,
"learning_rate": 3.579714664625706e-05,
"loss": 0.0004,
"num_input_tokens_seen": 569872,
"step": 2660
},
{
"epoch": 8.46031746031746,
"grad_norm": 0.11031365394592285,
"learning_rate": 3.5734638554985236e-05,
"loss": 0.0035,
"num_input_tokens_seen": 570976,
"step": 2665
},
{
"epoch": 8.476190476190476,
"grad_norm": 1.1275529861450195,
"learning_rate": 3.567204807631589e-05,
"loss": 0.0091,
"num_input_tokens_seen": 572064,
"step": 2670
},
{
"epoch": 8.492063492063492,
"grad_norm": 0.0019642009865492582,
"learning_rate": 3.560937569062538e-05,
"loss": 0.0003,
"num_input_tokens_seen": 573152,
"step": 2675
},
{
"epoch": 8.507936507936508,
"grad_norm": 0.027548756450414658,
"learning_rate": 3.554662187891873e-05,
"loss": 0.0057,
"num_input_tokens_seen": 574224,
"step": 2680
},
{
"epoch": 8.523809523809524,
"grad_norm": 0.0007244048174470663,
"learning_rate": 3.548378712282584e-05,
"loss": 0.0281,
"num_input_tokens_seen": 575360,
"step": 2685
},
{
"epoch": 8.53968253968254,
"grad_norm": 2.4872758388519287,
"learning_rate": 3.5420871904597895e-05,
"loss": 0.0024,
"num_input_tokens_seen": 576416,
"step": 2690
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.05758047476410866,
"learning_rate": 3.5357876707103596e-05,
"loss": 0.0003,
"num_input_tokens_seen": 577440,
"step": 2695
},
{
"epoch": 8.571428571428571,
"grad_norm": 3.4211556911468506,
"learning_rate": 3.529480201382551e-05,
"loss": 0.0024,
"num_input_tokens_seen": 578560,
"step": 2700
},
{
"epoch": 8.587301587301587,
"grad_norm": 0.19005665183067322,
"learning_rate": 3.523164830885629e-05,
"loss": 0.0111,
"num_input_tokens_seen": 579600,
"step": 2705
},
{
"epoch": 8.603174603174603,
"grad_norm": 0.00690614664927125,
"learning_rate": 3.516841607689501e-05,
"loss": 0.0006,
"num_input_tokens_seen": 580672,
"step": 2710
},
{
"epoch": 8.619047619047619,
"grad_norm": 0.0064396848902106285,
"learning_rate": 3.510510580324344e-05,
"loss": 0.0266,
"num_input_tokens_seen": 581728,
"step": 2715
},
{
"epoch": 8.634920634920634,
"grad_norm": 0.004690147936344147,
"learning_rate": 3.504171797380231e-05,
"loss": 0.0001,
"num_input_tokens_seen": 582752,
"step": 2720
},
{
"epoch": 8.65079365079365,
"grad_norm": 0.1530577689409256,
"learning_rate": 3.497825307506758e-05,
"loss": 0.0002,
"num_input_tokens_seen": 583856,
"step": 2725
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.0008610127260908484,
"learning_rate": 3.491471159412672e-05,
"loss": 0.0001,
"num_input_tokens_seen": 584960,
"step": 2730
},
{
"epoch": 8.682539682539682,
"grad_norm": 0.001741685438901186,
"learning_rate": 3.485109401865493e-05,
"loss": 0.0,
"num_input_tokens_seen": 586032,
"step": 2735
},
{
"epoch": 8.698412698412698,
"grad_norm": 1.3038556575775146,
"learning_rate": 3.478740083691147e-05,
"loss": 0.0104,
"num_input_tokens_seen": 587088,
"step": 2740
},
{
"epoch": 8.714285714285714,
"grad_norm": 0.11273951083421707,
"learning_rate": 3.4723632537735846e-05,
"loss": 0.0007,
"num_input_tokens_seen": 588128,
"step": 2745
},
{
"epoch": 8.73015873015873,
"grad_norm": 0.0002958101104013622,
"learning_rate": 3.46597896105441e-05,
"loss": 0.0,
"num_input_tokens_seen": 589152,
"step": 2750
},
{
"epoch": 8.746031746031747,
"grad_norm": 0.007939610630273819,
"learning_rate": 3.459587254532502e-05,
"loss": 0.0004,
"num_input_tokens_seen": 590208,
"step": 2755
},
{
"epoch": 8.761904761904763,
"grad_norm": 0.0005917858798056841,
"learning_rate": 3.453188183263639e-05,
"loss": 0.001,
"num_input_tokens_seen": 591344,
"step": 2760
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.0001375034626107663,
"learning_rate": 3.4467817963601264e-05,
"loss": 0.0007,
"num_input_tokens_seen": 592384,
"step": 2765
},
{
"epoch": 8.793650793650794,
"grad_norm": 0.010883470997214317,
"learning_rate": 3.440368142990416e-05,
"loss": 0.0105,
"num_input_tokens_seen": 593472,
"step": 2770
},
{
"epoch": 8.80952380952381,
"grad_norm": 0.0002069953188765794,
"learning_rate": 3.433947272378726e-05,
"loss": 0.0001,
"num_input_tokens_seen": 594560,
"step": 2775
},
{
"epoch": 8.825396825396826,
"grad_norm": 0.003037715097889304,
"learning_rate": 3.427519233804667e-05,
"loss": 0.0,
"num_input_tokens_seen": 595664,
"step": 2780
},
{
"epoch": 8.841269841269842,
"grad_norm": 0.002563037909567356,
"learning_rate": 3.421084076602867e-05,
"loss": 0.0096,
"num_input_tokens_seen": 596800,
"step": 2785
},
{
"epoch": 8.857142857142858,
"grad_norm": 0.007072729524224997,
"learning_rate": 3.414641850162584e-05,
"loss": 0.002,
"num_input_tokens_seen": 597904,
"step": 2790
},
{
"epoch": 8.873015873015873,
"grad_norm": 0.00026764694484882057,
"learning_rate": 3.408192603927334e-05,
"loss": 0.0001,
"num_input_tokens_seen": 598960,
"step": 2795
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.04823247343301773,
"learning_rate": 3.40173638739451e-05,
"loss": 0.0003,
"num_input_tokens_seen": 600080,
"step": 2800
},
{
"epoch": 8.904761904761905,
"grad_norm": 0.00041813074494712055,
"learning_rate": 3.395273250114999e-05,
"loss": 0.0,
"num_input_tokens_seen": 601104,
"step": 2805
},
{
"epoch": 8.920634920634921,
"grad_norm": 0.011840839870274067,
"learning_rate": 3.388803241692807e-05,
"loss": 0.0001,
"num_input_tokens_seen": 602160,
"step": 2810
},
{
"epoch": 8.936507936507937,
"grad_norm": 0.00030943809542804956,
"learning_rate": 3.382326411784672e-05,
"loss": 0.0,
"num_input_tokens_seen": 603184,
"step": 2815
},
{
"epoch": 8.952380952380953,
"grad_norm": 0.0344560369849205,
"learning_rate": 3.375842810099692e-05,
"loss": 0.0322,
"num_input_tokens_seen": 604208,
"step": 2820
},
{
"epoch": 8.968253968253968,
"grad_norm": 0.12215026468038559,
"learning_rate": 3.36935248639893e-05,
"loss": 0.0157,
"num_input_tokens_seen": 605344,
"step": 2825
},
{
"epoch": 8.984126984126984,
"grad_norm": 0.009739014320075512,
"learning_rate": 3.362855490495047e-05,
"loss": 0.0005,
"num_input_tokens_seen": 606416,
"step": 2830
},
{
"epoch": 9.0,
"grad_norm": 0.05252106115221977,
"learning_rate": 3.356351872251908e-05,
"loss": 0.0002,
"num_input_tokens_seen": 607456,
"step": 2835
},
{
"epoch": 9.0,
"eval_loss": 0.15105664730072021,
"eval_runtime": 1.4521,
"eval_samples_per_second": 48.206,
"eval_steps_per_second": 24.103,
"num_input_tokens_seen": 607456,
"step": 2835
},
{
"epoch": 9.015873015873016,
"grad_norm": 0.02688731625676155,
"learning_rate": 3.349841681584206e-05,
"loss": 0.0001,
"num_input_tokens_seen": 608528,
"step": 2840
},
{
"epoch": 9.031746031746032,
"grad_norm": 0.0004183761775493622,
"learning_rate": 3.343324968457076e-05,
"loss": 0.0049,
"num_input_tokens_seen": 609584,
"step": 2845
},
{
"epoch": 9.047619047619047,
"grad_norm": 0.000365029409294948,
"learning_rate": 3.336801782885712e-05,
"loss": 0.0002,
"num_input_tokens_seen": 610640,
"step": 2850
},
{
"epoch": 9.063492063492063,
"grad_norm": 0.00026122824056074023,
"learning_rate": 3.3302721749349834e-05,
"loss": 0.0001,
"num_input_tokens_seen": 611680,
"step": 2855
},
{
"epoch": 9.079365079365079,
"grad_norm": 0.00021482273587025702,
"learning_rate": 3.3237361947190536e-05,
"loss": 0.0001,
"num_input_tokens_seen": 612736,
"step": 2860
},
{
"epoch": 9.095238095238095,
"grad_norm": 0.00018885769532062113,
"learning_rate": 3.317193892400988e-05,
"loss": 0.0,
"num_input_tokens_seen": 613808,
"step": 2865
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.00014850927982479334,
"learning_rate": 3.310645318192378e-05,
"loss": 0.0001,
"num_input_tokens_seen": 614880,
"step": 2870
},
{
"epoch": 9.126984126984127,
"grad_norm": 0.1906156688928604,
"learning_rate": 3.304090522352946e-05,
"loss": 0.0003,
"num_input_tokens_seen": 616000,
"step": 2875
},
{
"epoch": 9.142857142857142,
"grad_norm": 0.14535604417324066,
"learning_rate": 3.2975295551901714e-05,
"loss": 0.0001,
"num_input_tokens_seen": 617072,
"step": 2880
},
{
"epoch": 9.158730158730158,
"grad_norm": 0.0005936230882070959,
"learning_rate": 3.290962467058891e-05,
"loss": 0.0002,
"num_input_tokens_seen": 618080,
"step": 2885
},
{
"epoch": 9.174603174603174,
"grad_norm": 0.00011047060252167284,
"learning_rate": 3.284389308360927e-05,
"loss": 0.0014,
"num_input_tokens_seen": 619152,
"step": 2890
},
{
"epoch": 9.19047619047619,
"grad_norm": 0.0350705049932003,
"learning_rate": 3.277810129544685e-05,
"loss": 0.0035,
"num_input_tokens_seen": 620224,
"step": 2895
},
{
"epoch": 9.206349206349206,
"grad_norm": 8.115587115753442e-05,
"learning_rate": 3.2712249811047785e-05,
"loss": 0.0,
"num_input_tokens_seen": 621312,
"step": 2900
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.00012116412835894153,
"learning_rate": 3.2646339135816386e-05,
"loss": 0.0,
"num_input_tokens_seen": 622400,
"step": 2905
},
{
"epoch": 9.238095238095237,
"grad_norm": 0.0076861935667693615,
"learning_rate": 3.258036977561123e-05,
"loss": 0.0007,
"num_input_tokens_seen": 623520,
"step": 2910
},
{
"epoch": 9.253968253968253,
"grad_norm": 0.0010363998590037227,
"learning_rate": 3.251434223674129e-05,
"loss": 0.0003,
"num_input_tokens_seen": 624624,
"step": 2915
},
{
"epoch": 9.26984126984127,
"grad_norm": 0.0008949214825406671,
"learning_rate": 3.244825702596205e-05,
"loss": 0.0,
"num_input_tokens_seen": 625712,
"step": 2920
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.002219531685113907,
"learning_rate": 3.238211465047166e-05,
"loss": 0.0,
"num_input_tokens_seen": 626784,
"step": 2925
},
{
"epoch": 9.301587301587302,
"grad_norm": 0.00024516129633411765,
"learning_rate": 3.231591561790696e-05,
"loss": 0.0,
"num_input_tokens_seen": 627872,
"step": 2930
},
{
"epoch": 9.317460317460318,
"grad_norm": 0.00031894820858724415,
"learning_rate": 3.224966043633966e-05,
"loss": 0.0014,
"num_input_tokens_seen": 628992,
"step": 2935
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.0001425828959327191,
"learning_rate": 3.2183349614272374e-05,
"loss": 0.0,
"num_input_tokens_seen": 630048,
"step": 2940
},
{
"epoch": 9.34920634920635,
"grad_norm": 0.00026056909700855613,
"learning_rate": 3.2116983660634787e-05,
"loss": 0.0004,
"num_input_tokens_seen": 631120,
"step": 2945
},
{
"epoch": 9.365079365079366,
"grad_norm": 0.00025601257220841944,
"learning_rate": 3.205056308477969e-05,
"loss": 0.0003,
"num_input_tokens_seen": 632128,
"step": 2950
},
{
"epoch": 9.380952380952381,
"grad_norm": 9.780770051293075e-05,
"learning_rate": 3.198408839647911e-05,
"loss": 0.0,
"num_input_tokens_seen": 633136,
"step": 2955
},
{
"epoch": 9.396825396825397,
"grad_norm": 0.00019243262067902833,
"learning_rate": 3.191756010592038e-05,
"loss": 0.0001,
"num_input_tokens_seen": 634208,
"step": 2960
},
{
"epoch": 9.412698412698413,
"grad_norm": 0.0024152263067662716,
"learning_rate": 3.185097872370221e-05,
"loss": 0.0,
"num_input_tokens_seen": 635312,
"step": 2965
},
{
"epoch": 9.428571428571429,
"grad_norm": 0.0002794755273498595,
"learning_rate": 3.17843447608308e-05,
"loss": 0.0,
"num_input_tokens_seen": 636336,
"step": 2970
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.10732641071081161,
"learning_rate": 3.17176587287159e-05,
"loss": 0.0028,
"num_input_tokens_seen": 637472,
"step": 2975
},
{
"epoch": 9.46031746031746,
"grad_norm": 0.0006334662321023643,
"learning_rate": 3.165092113916688e-05,
"loss": 0.0,
"num_input_tokens_seen": 638576,
"step": 2980
},
{
"epoch": 9.476190476190476,
"grad_norm": 0.000548696902114898,
"learning_rate": 3.158413250438882e-05,
"loss": 0.0,
"num_input_tokens_seen": 639584,
"step": 2985
},
{
"epoch": 9.492063492063492,
"grad_norm": 0.00021050528448540717,
"learning_rate": 3.151729333697854e-05,
"loss": 0.0,
"num_input_tokens_seen": 640640,
"step": 2990
},
{
"epoch": 9.507936507936508,
"grad_norm": 0.0002193010732298717,
"learning_rate": 3.1450404149920736e-05,
"loss": 0.0,
"num_input_tokens_seen": 641696,
"step": 2995
},
{
"epoch": 9.523809523809524,
"grad_norm": 0.7367925643920898,
"learning_rate": 3.138346545658397e-05,
"loss": 0.0007,
"num_input_tokens_seen": 642816,
"step": 3000
},
{
"epoch": 9.53968253968254,
"grad_norm": 0.11529196798801422,
"learning_rate": 3.131647777071677e-05,
"loss": 0.0137,
"num_input_tokens_seen": 643920,
"step": 3005
},
{
"epoch": 9.555555555555555,
"grad_norm": 8.780050120549276e-05,
"learning_rate": 3.1249441606443665e-05,
"loss": 0.0,
"num_input_tokens_seen": 644944,
"step": 3010
},
{
"epoch": 9.571428571428571,
"grad_norm": 0.14377683401107788,
"learning_rate": 3.1182357478261274e-05,
"loss": 0.0002,
"num_input_tokens_seen": 645968,
"step": 3015
},
{
"epoch": 9.587301587301587,
"grad_norm": 0.00015247806732077152,
"learning_rate": 3.111522590103432e-05,
"loss": 0.0251,
"num_input_tokens_seen": 647040,
"step": 3020
},
{
"epoch": 9.603174603174603,
"grad_norm": 0.00019754045933950692,
"learning_rate": 3.104804738999169e-05,
"loss": 0.0,
"num_input_tokens_seen": 648112,
"step": 3025
},
{
"epoch": 9.619047619047619,
"grad_norm": 0.00045227553346194327,
"learning_rate": 3.0980822460722504e-05,
"loss": 0.0001,
"num_input_tokens_seen": 649168,
"step": 3030
},
{
"epoch": 9.634920634920634,
"grad_norm": 0.14106620848178864,
"learning_rate": 3.091355162917211e-05,
"loss": 0.0,
"num_input_tokens_seen": 650192,
"step": 3035
},
{
"epoch": 9.65079365079365,
"grad_norm": 6.160133361816406,
"learning_rate": 3.084623541163817e-05,
"loss": 0.0301,
"num_input_tokens_seen": 651280,
"step": 3040
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.0001889690029202029,
"learning_rate": 3.0778874324766676e-05,
"loss": 0.0,
"num_input_tokens_seen": 652288,
"step": 3045
},
{
"epoch": 9.682539682539682,
"grad_norm": 0.001083326991647482,
"learning_rate": 3.071146888554799e-05,
"loss": 0.0003,
"num_input_tokens_seen": 653408,
"step": 3050
},
{
"epoch": 9.698412698412698,
"grad_norm": 0.002526765689253807,
"learning_rate": 3.0644019611312865e-05,
"loss": 0.0,
"num_input_tokens_seen": 654496,
"step": 3055
},
{
"epoch": 9.714285714285714,
"grad_norm": 0.0024127070792019367,
"learning_rate": 3.057652701972848e-05,
"loss": 0.0001,
"num_input_tokens_seen": 655632,
"step": 3060
},
{
"epoch": 9.73015873015873,
"grad_norm": 0.0006554791470989585,
"learning_rate": 3.050899162879451e-05,
"loss": 0.0202,
"num_input_tokens_seen": 656720,
"step": 3065
},
{
"epoch": 9.746031746031747,
"grad_norm": 0.008043559268116951,
"learning_rate": 3.044141395683906e-05,
"loss": 0.0008,
"num_input_tokens_seen": 657824,
"step": 3070
},
{
"epoch": 9.761904761904763,
"grad_norm": 0.00044315162813290954,
"learning_rate": 3.037379452251477e-05,
"loss": 0.0,
"num_input_tokens_seen": 658912,
"step": 3075
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.0003389069461263716,
"learning_rate": 3.0306133844794783e-05,
"loss": 0.0,
"num_input_tokens_seen": 660000,
"step": 3080
},
{
"epoch": 9.793650793650794,
"grad_norm": 0.0010802766773849726,
"learning_rate": 3.02384324429688e-05,
"loss": 0.0407,
"num_input_tokens_seen": 661040,
"step": 3085
},
{
"epoch": 9.80952380952381,
"grad_norm": 6.0305914878845215,
"learning_rate": 3.0170690836639065e-05,
"loss": 0.0093,
"num_input_tokens_seen": 662016,
"step": 3090
},
{
"epoch": 9.825396825396826,
"grad_norm": 0.0006042938912287354,
"learning_rate": 3.0102909545716396e-05,
"loss": 0.0004,
"num_input_tokens_seen": 663040,
"step": 3095
},
{
"epoch": 9.841269841269842,
"grad_norm": 0.00038514367770403624,
"learning_rate": 3.003508909041617e-05,
"loss": 0.0002,
"num_input_tokens_seen": 664096,
"step": 3100
},
{
"epoch": 9.857142857142858,
"grad_norm": 0.0013175939675420523,
"learning_rate": 2.9967229991254363e-05,
"loss": 0.0024,
"num_input_tokens_seen": 665104,
"step": 3105
},
{
"epoch": 9.873015873015873,
"grad_norm": 0.0002575514663476497,
"learning_rate": 2.989933276904353e-05,
"loss": 0.0005,
"num_input_tokens_seen": 666272,
"step": 3110
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.00040172351873479784,
"learning_rate": 2.9831397944888833e-05,
"loss": 0.0,
"num_input_tokens_seen": 667344,
"step": 3115
},
{
"epoch": 9.904761904761905,
"grad_norm": 0.0011199676664546132,
"learning_rate": 2.9763426040184007e-05,
"loss": 0.0001,
"num_input_tokens_seen": 668400,
"step": 3120
},
{
"epoch": 9.920634920634921,
"grad_norm": 0.0007416466833092272,
"learning_rate": 2.9695417576607376e-05,
"loss": 0.0012,
"num_input_tokens_seen": 669504,
"step": 3125
},
{
"epoch": 9.936507936507937,
"grad_norm": 0.00034860780579037964,
"learning_rate": 2.9627373076117863e-05,
"loss": 0.0008,
"num_input_tokens_seen": 670608,
"step": 3130
},
{
"epoch": 9.952380952380953,
"grad_norm": 0.0011542192660272121,
"learning_rate": 2.9559293060950977e-05,
"loss": 0.0103,
"num_input_tokens_seen": 671632,
"step": 3135
},
{
"epoch": 9.968253968253968,
"grad_norm": 0.00018505194748286158,
"learning_rate": 2.9491178053614776e-05,
"loss": 0.0218,
"num_input_tokens_seen": 672736,
"step": 3140
},
{
"epoch": 9.984126984126984,
"grad_norm": 0.0003018953138962388,
"learning_rate": 2.9423028576885893e-05,
"loss": 0.0001,
"num_input_tokens_seen": 673760,
"step": 3145
},
{
"epoch": 10.0,
"grad_norm": 0.0003000242286361754,
"learning_rate": 2.9354845153805505e-05,
"loss": 0.0,
"num_input_tokens_seen": 674784,
"step": 3150
},
{
"epoch": 10.0,
"eval_loss": 0.1815599650144577,
"eval_runtime": 1.4451,
"eval_samples_per_second": 48.439,
"eval_steps_per_second": 24.219,
"num_input_tokens_seen": 674784,
"step": 3150
},
{
"epoch": 10.015873015873016,
"grad_norm": 0.0002667237422429025,
"learning_rate": 2.928662830767534e-05,
"loss": 0.0,
"num_input_tokens_seen": 675840,
"step": 3155
},
{
"epoch": 10.031746031746032,
"grad_norm": 1.0574944019317627,
"learning_rate": 2.9218378562053623e-05,
"loss": 0.0006,
"num_input_tokens_seen": 676896,
"step": 3160
},
{
"epoch": 10.047619047619047,
"grad_norm": 0.00017189487698487937,
"learning_rate": 2.9150096440751107e-05,
"loss": 0.0,
"num_input_tokens_seen": 677952,
"step": 3165
},
{
"epoch": 10.063492063492063,
"grad_norm": 0.0003118932945653796,
"learning_rate": 2.908178246782698e-05,
"loss": 0.0,
"num_input_tokens_seen": 678976,
"step": 3170
},
{
"epoch": 10.079365079365079,
"grad_norm": 0.23004379868507385,
"learning_rate": 2.9013437167584944e-05,
"loss": 0.0002,
"num_input_tokens_seen": 680016,
"step": 3175
},
{
"epoch": 10.095238095238095,
"grad_norm": 0.0002921000123023987,
"learning_rate": 2.894506106456909e-05,
"loss": 0.0001,
"num_input_tokens_seen": 681088,
"step": 3180
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.05020623281598091,
"learning_rate": 2.8876654683559944e-05,
"loss": 0.0008,
"num_input_tokens_seen": 682160,
"step": 3185
},
{
"epoch": 10.126984126984127,
"grad_norm": 0.0664471909403801,
"learning_rate": 2.8808218549570408e-05,
"loss": 0.0002,
"num_input_tokens_seen": 683232,
"step": 3190
},
{
"epoch": 10.142857142857142,
"grad_norm": 0.05454224720597267,
"learning_rate": 2.8739753187841733e-05,
"loss": 0.0001,
"num_input_tokens_seen": 684304,
"step": 3195
},
{
"epoch": 10.158730158730158,
"grad_norm": 0.004288592375814915,
"learning_rate": 2.8671259123839472e-05,
"loss": 0.0078,
"num_input_tokens_seen": 685440,
"step": 3200
},
{
"epoch": 10.174603174603174,
"grad_norm": 0.0005515673547051847,
"learning_rate": 2.8602736883249503e-05,
"loss": 0.0001,
"num_input_tokens_seen": 686576,
"step": 3205
},
{
"epoch": 10.19047619047619,
"grad_norm": 0.005215165205299854,
"learning_rate": 2.8534186991973932e-05,
"loss": 0.0,
"num_input_tokens_seen": 687632,
"step": 3210
},
{
"epoch": 10.206349206349206,
"grad_norm": 0.01046276930719614,
"learning_rate": 2.8465609976127082e-05,
"loss": 0.0,
"num_input_tokens_seen": 688704,
"step": 3215
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.000193351210327819,
"learning_rate": 2.839700636203146e-05,
"loss": 0.0,
"num_input_tokens_seen": 689776,
"step": 3220
},
{
"epoch": 10.238095238095237,
"grad_norm": 0.014234524220228195,
"learning_rate": 2.8328376676213713e-05,
"loss": 0.0029,
"num_input_tokens_seen": 690864,
"step": 3225
},
{
"epoch": 10.253968253968253,
"grad_norm": 0.0002087104512611404,
"learning_rate": 2.8259721445400577e-05,
"loss": 0.0,
"num_input_tokens_seen": 691904,
"step": 3230
},
{
"epoch": 10.26984126984127,
"grad_norm": 0.006072845309972763,
"learning_rate": 2.8191041196514873e-05,
"loss": 0.0,
"num_input_tokens_seen": 692992,
"step": 3235
},
{
"epoch": 10.285714285714286,
"grad_norm": 0.059259865432977676,
"learning_rate": 2.8122336456671378e-05,
"loss": 0.0002,
"num_input_tokens_seen": 694016,
"step": 3240
},
{
"epoch": 10.301587301587302,
"grad_norm": 8.876676559448242,
"learning_rate": 2.8053607753172895e-05,
"loss": 0.0183,
"num_input_tokens_seen": 695152,
"step": 3245
},
{
"epoch": 10.317460317460318,
"grad_norm": 0.0069928658194839954,
"learning_rate": 2.7984855613506107e-05,
"loss": 0.0,
"num_input_tokens_seen": 696176,
"step": 3250
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.00021716665651183575,
"learning_rate": 2.791608056533759e-05,
"loss": 0.0002,
"num_input_tokens_seen": 697312,
"step": 3255
},
{
"epoch": 10.34920634920635,
"grad_norm": 0.00015753868501633406,
"learning_rate": 2.7847283136509717e-05,
"loss": 0.0002,
"num_input_tokens_seen": 698336,
"step": 3260
},
{
"epoch": 10.365079365079366,
"grad_norm": 0.00016610305465292186,
"learning_rate": 2.7778463855036657e-05,
"loss": 0.0001,
"num_input_tokens_seen": 699488,
"step": 3265
},
{
"epoch": 10.380952380952381,
"grad_norm": 0.00039039889816194773,
"learning_rate": 2.770962324910027e-05,
"loss": 0.0,
"num_input_tokens_seen": 700512,
"step": 3270
},
{
"epoch": 10.396825396825397,
"grad_norm": 0.00014512175403069705,
"learning_rate": 2.7640761847046105e-05,
"loss": 0.005,
"num_input_tokens_seen": 701552,
"step": 3275
},
{
"epoch": 10.412698412698413,
"grad_norm": 0.0017348774708807468,
"learning_rate": 2.75718801773793e-05,
"loss": 0.0,
"num_input_tokens_seen": 702672,
"step": 3280
},
{
"epoch": 10.428571428571429,
"grad_norm": 0.00017332640709355474,
"learning_rate": 2.750297876876055e-05,
"loss": 0.0001,
"num_input_tokens_seen": 703712,
"step": 3285
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.00014672847464680672,
"learning_rate": 2.743405815000205e-05,
"loss": 0.0309,
"num_input_tokens_seen": 704816,
"step": 3290
},
{
"epoch": 10.46031746031746,
"grad_norm": 0.0005181178566999733,
"learning_rate": 2.736511885006343e-05,
"loss": 0.0,
"num_input_tokens_seen": 705904,
"step": 3295
},
{
"epoch": 10.476190476190476,
"grad_norm": 0.0001759826991474256,
"learning_rate": 2.729616139804769e-05,
"loss": 0.0,
"num_input_tokens_seen": 706944,
"step": 3300
},
{
"epoch": 10.492063492063492,
"grad_norm": 0.004806222394108772,
"learning_rate": 2.7227186323197162e-05,
"loss": 0.0,
"num_input_tokens_seen": 708048,
"step": 3305
},
{
"epoch": 10.507936507936508,
"grad_norm": 0.00019694813818205148,
"learning_rate": 2.7158194154889394e-05,
"loss": 0.0,
"num_input_tokens_seen": 709040,
"step": 3310
},
{
"epoch": 10.523809523809524,
"grad_norm": 0.005902845412492752,
"learning_rate": 2.7089185422633178e-05,
"loss": 0.0094,
"num_input_tokens_seen": 710112,
"step": 3315
},
{
"epoch": 10.53968253968254,
"grad_norm": 0.0002289386175107211,
"learning_rate": 2.7020160656064382e-05,
"loss": 0.0042,
"num_input_tokens_seen": 711120,
"step": 3320
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.00042782543459907174,
"learning_rate": 2.695112038494198e-05,
"loss": 0.0,
"num_input_tokens_seen": 712272,
"step": 3325
},
{
"epoch": 10.571428571428571,
"grad_norm": 0.014789941720664501,
"learning_rate": 2.6882065139143907e-05,
"loss": 0.0001,
"num_input_tokens_seen": 713360,
"step": 3330
},
{
"epoch": 10.587301587301587,
"grad_norm": 1.8601882457733154,
"learning_rate": 2.6812995448663047e-05,
"loss": 0.0011,
"num_input_tokens_seen": 714496,
"step": 3335
},
{
"epoch": 10.603174603174603,
"grad_norm": 0.0001709021016722545,
"learning_rate": 2.674391184360313e-05,
"loss": 0.0001,
"num_input_tokens_seen": 715568,
"step": 3340
},
{
"epoch": 10.619047619047619,
"grad_norm": 0.0006503509357571602,
"learning_rate": 2.6674814854174708e-05,
"loss": 0.0,
"num_input_tokens_seen": 716688,
"step": 3345
},
{
"epoch": 10.634920634920634,
"grad_norm": 0.0003370628983248025,
"learning_rate": 2.6605705010691025e-05,
"loss": 0.0,
"num_input_tokens_seen": 717664,
"step": 3350
},
{
"epoch": 10.65079365079365,
"grad_norm": 11.525466918945312,
"learning_rate": 2.6536582843563995e-05,
"loss": 0.0642,
"num_input_tokens_seen": 718784,
"step": 3355
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.011019658297300339,
"learning_rate": 2.6467448883300104e-05,
"loss": 0.0002,
"num_input_tokens_seen": 719840,
"step": 3360
},
{
"epoch": 10.682539682539682,
"grad_norm": 0.0003383358125574887,
"learning_rate": 2.6398303660496376e-05,
"loss": 0.0002,
"num_input_tokens_seen": 720960,
"step": 3365
},
{
"epoch": 10.698412698412698,
"grad_norm": 0.0012447485933080316,
"learning_rate": 2.6329147705836238e-05,
"loss": 0.0,
"num_input_tokens_seen": 722064,
"step": 3370
},
{
"epoch": 10.714285714285714,
"grad_norm": 0.00022383588657248765,
"learning_rate": 2.6259981550085504e-05,
"loss": 0.0,
"num_input_tokens_seen": 723152,
"step": 3375
},
{
"epoch": 10.73015873015873,
"grad_norm": 0.00043943486525677145,
"learning_rate": 2.6190805724088274e-05,
"loss": 0.0,
"num_input_tokens_seen": 724208,
"step": 3380
},
{
"epoch": 10.746031746031747,
"grad_norm": 0.019387539476156235,
"learning_rate": 2.6121620758762877e-05,
"loss": 0.0001,
"num_input_tokens_seen": 725296,
"step": 3385
},
{
"epoch": 10.761904761904763,
"grad_norm": 0.026385366916656494,
"learning_rate": 2.6052427185097765e-05,
"loss": 0.0,
"num_input_tokens_seen": 726384,
"step": 3390
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.0005208961665630341,
"learning_rate": 2.598322553414749e-05,
"loss": 0.0,
"num_input_tokens_seen": 727424,
"step": 3395
},
{
"epoch": 10.793650793650794,
"grad_norm": 0.0025588928256183863,
"learning_rate": 2.591401633702856e-05,
"loss": 0.001,
"num_input_tokens_seen": 728528,
"step": 3400
},
{
"epoch": 10.80952380952381,
"grad_norm": 0.0004624544526450336,
"learning_rate": 2.584480012491542e-05,
"loss": 0.0,
"num_input_tokens_seen": 729616,
"step": 3405
},
{
"epoch": 10.825396825396826,
"grad_norm": 0.016153214499354362,
"learning_rate": 2.5775577429036345e-05,
"loss": 0.0001,
"num_input_tokens_seen": 730640,
"step": 3410
},
{
"epoch": 10.841269841269842,
"grad_norm": 0.00031025870703160763,
"learning_rate": 2.5706348780669393e-05,
"loss": 0.0,
"num_input_tokens_seen": 731712,
"step": 3415
},
{
"epoch": 10.857142857142858,
"grad_norm": 0.0006250953883863986,
"learning_rate": 2.5637114711138282e-05,
"loss": 0.0,
"num_input_tokens_seen": 732720,
"step": 3420
},
{
"epoch": 10.873015873015873,
"grad_norm": 0.013909174129366875,
"learning_rate": 2.5567875751808353e-05,
"loss": 0.0001,
"num_input_tokens_seen": 733792,
"step": 3425
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.0007239268743433058,
"learning_rate": 2.5498632434082452e-05,
"loss": 0.0002,
"num_input_tokens_seen": 734880,
"step": 3430
},
{
"epoch": 10.904761904761905,
"grad_norm": 0.0003793005016632378,
"learning_rate": 2.542938528939691e-05,
"loss": 0.0014,
"num_input_tokens_seen": 735936,
"step": 3435
},
{
"epoch": 10.920634920634921,
"grad_norm": 0.00018004873709287494,
"learning_rate": 2.5360134849217416e-05,
"loss": 0.0,
"num_input_tokens_seen": 736976,
"step": 3440
},
{
"epoch": 10.936507936507937,
"grad_norm": 0.0006164236110635102,
"learning_rate": 2.5290881645034932e-05,
"loss": 0.0292,
"num_input_tokens_seen": 738064,
"step": 3445
},
{
"epoch": 10.952380952380953,
"grad_norm": 0.0004222550487611443,
"learning_rate": 2.5221626208361655e-05,
"loss": 0.0,
"num_input_tokens_seen": 739152,
"step": 3450
},
{
"epoch": 10.968253968253968,
"grad_norm": 0.0003485404886305332,
"learning_rate": 2.515236907072691e-05,
"loss": 0.0,
"num_input_tokens_seen": 740240,
"step": 3455
},
{
"epoch": 10.984126984126984,
"grad_norm": 0.0003257024218328297,
"learning_rate": 2.5083110763673085e-05,
"loss": 0.0008,
"num_input_tokens_seen": 741328,
"step": 3460
},
{
"epoch": 11.0,
"grad_norm": 0.0008957489626482129,
"learning_rate": 2.5013851818751534e-05,
"loss": 0.0001,
"num_input_tokens_seen": 742336,
"step": 3465
},
{
"epoch": 11.0,
"eval_loss": 0.19778476655483246,
"eval_runtime": 1.4538,
"eval_samples_per_second": 48.149,
"eval_steps_per_second": 24.075,
"num_input_tokens_seen": 742336,
"step": 3465
},
{
"epoch": 11.015873015873016,
"grad_norm": 0.00035931816091760993,
"learning_rate": 2.4944592767518495e-05,
"loss": 0.0001,
"num_input_tokens_seen": 743456,
"step": 3470
},
{
"epoch": 11.031746031746032,
"grad_norm": 0.00027192573179490864,
"learning_rate": 2.4875334141531052e-05,
"loss": 0.0,
"num_input_tokens_seen": 744528,
"step": 3475
},
{
"epoch": 11.047619047619047,
"grad_norm": 0.004379758145660162,
"learning_rate": 2.4806076472342997e-05,
"loss": 0.0,
"num_input_tokens_seen": 745520,
"step": 3480
},
{
"epoch": 11.063492063492063,
"grad_norm": 0.0010909591801464558,
"learning_rate": 2.4736820291500793e-05,
"loss": 0.0,
"num_input_tokens_seen": 746592,
"step": 3485
},
{
"epoch": 11.079365079365079,
"grad_norm": 0.0003718891239259392,
"learning_rate": 2.466756613053948e-05,
"loss": 0.0,
"num_input_tokens_seen": 747696,
"step": 3490
},
{
"epoch": 11.095238095238095,
"grad_norm": 18.98329734802246,
"learning_rate": 2.459831452097859e-05,
"loss": 0.0252,
"num_input_tokens_seen": 748816,
"step": 3495
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.003277825890108943,
"learning_rate": 2.4529065994318078e-05,
"loss": 0.0,
"num_input_tokens_seen": 749840,
"step": 3500
},
{
"epoch": 11.126984126984127,
"grad_norm": 0.013009368441998959,
"learning_rate": 2.445982108203422e-05,
"loss": 0.0003,
"num_input_tokens_seen": 750944,
"step": 3505
},
{
"epoch": 11.142857142857142,
"grad_norm": 0.03715138137340546,
"learning_rate": 2.43905803155756e-05,
"loss": 0.0001,
"num_input_tokens_seen": 752000,
"step": 3510
},
{
"epoch": 11.158730158730158,
"grad_norm": 0.0052580940537154675,
"learning_rate": 2.432134422635893e-05,
"loss": 0.0,
"num_input_tokens_seen": 753152,
"step": 3515
},
{
"epoch": 11.174603174603174,
"grad_norm": 0.0003532212576828897,
"learning_rate": 2.4252113345765046e-05,
"loss": 0.0,
"num_input_tokens_seen": 754224,
"step": 3520
},
{
"epoch": 11.19047619047619,
"grad_norm": 0.0006754444329999387,
"learning_rate": 2.4182888205134797e-05,
"loss": 0.0,
"num_input_tokens_seen": 755312,
"step": 3525
},
{
"epoch": 11.206349206349206,
"grad_norm": 0.00020079000387340784,
"learning_rate": 2.4113669335765017e-05,
"loss": 0.0,
"num_input_tokens_seen": 756336,
"step": 3530
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.00023593910736963153,
"learning_rate": 2.404445726890437e-05,
"loss": 0.0,
"num_input_tokens_seen": 757360,
"step": 3535
},
{
"epoch": 11.238095238095237,
"grad_norm": 0.0003324486897327006,
"learning_rate": 2.397525253574931e-05,
"loss": 0.0,
"num_input_tokens_seen": 758400,
"step": 3540
},
{
"epoch": 11.253968253968253,
"grad_norm": 0.003988176584243774,
"learning_rate": 2.390605566744002e-05,
"loss": 0.0003,
"num_input_tokens_seen": 759456,
"step": 3545
},
{
"epoch": 11.26984126984127,
"grad_norm": 0.0004338165163062513,
"learning_rate": 2.3836867195056335e-05,
"loss": 0.0,
"num_input_tokens_seen": 760480,
"step": 3550
},
{
"epoch": 11.285714285714286,
"grad_norm": 0.0001763407635735348,
"learning_rate": 2.376768764961362e-05,
"loss": 0.0,
"num_input_tokens_seen": 761552,
"step": 3555
},
{
"epoch": 11.301587301587302,
"grad_norm": 0.0491175577044487,
"learning_rate": 2.3698517562058758e-05,
"loss": 0.0001,
"num_input_tokens_seen": 762624,
"step": 3560
},
{
"epoch": 11.317460317460318,
"grad_norm": 0.00014863189426250756,
"learning_rate": 2.3629357463265995e-05,
"loss": 0.0,
"num_input_tokens_seen": 763696,
"step": 3565
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.004388798493891954,
"learning_rate": 2.3560207884032987e-05,
"loss": 0.0,
"num_input_tokens_seen": 764800,
"step": 3570
},
{
"epoch": 11.34920634920635,
"grad_norm": 0.0027450949419289827,
"learning_rate": 2.349106935507659e-05,
"loss": 0.0,
"num_input_tokens_seen": 765872,
"step": 3575
},
{
"epoch": 11.365079365079366,
"grad_norm": 0.0010159960947930813,
"learning_rate": 2.342194240702888e-05,
"loss": 0.0,
"num_input_tokens_seen": 766928,
"step": 3580
},
{
"epoch": 11.380952380952381,
"grad_norm": 0.0001987464347621426,
"learning_rate": 2.3352827570433036e-05,
"loss": 0.0001,
"num_input_tokens_seen": 768064,
"step": 3585
},
{
"epoch": 11.396825396825397,
"grad_norm": 0.001201266422867775,
"learning_rate": 2.3283725375739303e-05,
"loss": 0.0001,
"num_input_tokens_seen": 769168,
"step": 3590
},
{
"epoch": 11.412698412698413,
"grad_norm": 0.03675874322652817,
"learning_rate": 2.321463635330088e-05,
"loss": 0.0001,
"num_input_tokens_seen": 770224,
"step": 3595
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.000684223894495517,
"learning_rate": 2.3145561033369877e-05,
"loss": 0.0,
"num_input_tokens_seen": 771296,
"step": 3600
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.0001652841456234455,
"learning_rate": 2.3076499946093243e-05,
"loss": 0.0,
"num_input_tokens_seen": 772400,
"step": 3605
},
{
"epoch": 11.46031746031746,
"grad_norm": 0.02017052099108696,
"learning_rate": 2.300745362150869e-05,
"loss": 0.0,
"num_input_tokens_seen": 773456,
"step": 3610
},
{
"epoch": 11.476190476190476,
"grad_norm": 0.00025198451476171613,
"learning_rate": 2.2938422589540627e-05,
"loss": 0.0,
"num_input_tokens_seen": 774432,
"step": 3615
},
{
"epoch": 11.492063492063492,
"grad_norm": 0.014424529857933521,
"learning_rate": 2.2869407379996088e-05,
"loss": 0.0071,
"num_input_tokens_seen": 775520,
"step": 3620
},
{
"epoch": 11.507936507936508,
"grad_norm": 0.0003219831851311028,
"learning_rate": 2.2800408522560678e-05,
"loss": 0.0,
"num_input_tokens_seen": 776544,
"step": 3625
},
{
"epoch": 11.523809523809524,
"grad_norm": 0.0002450917090754956,
"learning_rate": 2.2731426546794508e-05,
"loss": 0.0,
"num_input_tokens_seen": 777728,
"step": 3630
},
{
"epoch": 11.53968253968254,
"grad_norm": 0.00040673837065696716,
"learning_rate": 2.2662461982128108e-05,
"loss": 0.0198,
"num_input_tokens_seen": 778784,
"step": 3635
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.00013897109602112323,
"learning_rate": 2.259351535785839e-05,
"loss": 0.0,
"num_input_tokens_seen": 779888,
"step": 3640
},
{
"epoch": 11.571428571428571,
"grad_norm": 0.000246451236307621,
"learning_rate": 2.2524587203144565e-05,
"loss": 0.0178,
"num_input_tokens_seen": 780960,
"step": 3645
},
{
"epoch": 11.587301587301587,
"grad_norm": 0.00041408365359529853,
"learning_rate": 2.2455678047004107e-05,
"loss": 0.0,
"num_input_tokens_seen": 782048,
"step": 3650
},
{
"epoch": 11.603174603174603,
"grad_norm": 0.0011768187396228313,
"learning_rate": 2.238678841830867e-05,
"loss": 0.0,
"num_input_tokens_seen": 783104,
"step": 3655
},
{
"epoch": 11.619047619047619,
"grad_norm": 0.01867389678955078,
"learning_rate": 2.2317918845780027e-05,
"loss": 0.0,
"num_input_tokens_seen": 784160,
"step": 3660
},
{
"epoch": 11.634920634920634,
"grad_norm": 0.00020991513156332076,
"learning_rate": 2.2249069857986027e-05,
"loss": 0.0,
"num_input_tokens_seen": 785264,
"step": 3665
},
{
"epoch": 11.65079365079365,
"grad_norm": 0.00016364398470614105,
"learning_rate": 2.218024198333656e-05,
"loss": 0.0,
"num_input_tokens_seen": 786288,
"step": 3670
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.0001746386114973575,
"learning_rate": 2.2111435750079434e-05,
"loss": 0.0,
"num_input_tokens_seen": 787360,
"step": 3675
},
{
"epoch": 11.682539682539682,
"grad_norm": 0.0014488694723695517,
"learning_rate": 2.2042651686296378e-05,
"loss": 0.0035,
"num_input_tokens_seen": 788400,
"step": 3680
},
{
"epoch": 11.698412698412698,
"grad_norm": 0.0005274708964861929,
"learning_rate": 2.1973890319898963e-05,
"loss": 0.0,
"num_input_tokens_seen": 789472,
"step": 3685
},
{
"epoch": 11.714285714285714,
"grad_norm": 0.0021408062893897295,
"learning_rate": 2.1905152178624595e-05,
"loss": 0.002,
"num_input_tokens_seen": 790512,
"step": 3690
},
{
"epoch": 11.73015873015873,
"grad_norm": 0.001208798261359334,
"learning_rate": 2.183643779003239e-05,
"loss": 0.0,
"num_input_tokens_seen": 791600,
"step": 3695
},
{
"epoch": 11.746031746031747,
"grad_norm": 0.0007551803719252348,
"learning_rate": 2.1767747681499176e-05,
"loss": 0.0,
"num_input_tokens_seen": 792704,
"step": 3700
},
{
"epoch": 11.761904761904763,
"grad_norm": 0.00040706252912059426,
"learning_rate": 2.1699082380215425e-05,
"loss": 0.0,
"num_input_tokens_seen": 793792,
"step": 3705
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.0019093899754807353,
"learning_rate": 2.1630442413181246e-05,
"loss": 0.0001,
"num_input_tokens_seen": 794944,
"step": 3710
},
{
"epoch": 11.793650793650794,
"grad_norm": 0.0044355797581374645,
"learning_rate": 2.156182830720228e-05,
"loss": 0.0,
"num_input_tokens_seen": 795968,
"step": 3715
},
{
"epoch": 11.80952380952381,
"grad_norm": 0.0013154816115275025,
"learning_rate": 2.14932405888857e-05,
"loss": 0.0,
"num_input_tokens_seen": 797040,
"step": 3720
},
{
"epoch": 11.825396825396826,
"grad_norm": 0.05848317593336105,
"learning_rate": 2.1424679784636144e-05,
"loss": 0.0001,
"num_input_tokens_seen": 798064,
"step": 3725
},
{
"epoch": 11.841269841269842,
"grad_norm": 0.00017026295245159417,
"learning_rate": 2.1356146420651706e-05,
"loss": 0.0,
"num_input_tokens_seen": 799104,
"step": 3730
},
{
"epoch": 11.857142857142858,
"grad_norm": 0.0005860523087903857,
"learning_rate": 2.1287641022919866e-05,
"loss": 0.0,
"num_input_tokens_seen": 800240,
"step": 3735
},
{
"epoch": 11.873015873015873,
"grad_norm": 0.0011898577213287354,
"learning_rate": 2.121916411721346e-05,
"loss": 0.0344,
"num_input_tokens_seen": 801312,
"step": 3740
},
{
"epoch": 11.88888888888889,
"grad_norm": 1.2069417238235474,
"learning_rate": 2.115071622908666e-05,
"loss": 0.001,
"num_input_tokens_seen": 802352,
"step": 3745
},
{
"epoch": 11.904761904761905,
"grad_norm": 0.00797231961041689,
"learning_rate": 2.1082297883870937e-05,
"loss": 0.0,
"num_input_tokens_seen": 803424,
"step": 3750
},
{
"epoch": 11.920634920634921,
"grad_norm": 0.0008888400625437498,
"learning_rate": 2.1013909606671004e-05,
"loss": 0.0006,
"num_input_tokens_seen": 804512,
"step": 3755
},
{
"epoch": 11.936507936507937,
"grad_norm": 0.0006856803665868938,
"learning_rate": 2.0945551922360818e-05,
"loss": 0.0,
"num_input_tokens_seen": 805584,
"step": 3760
},
{
"epoch": 11.952380952380953,
"grad_norm": 3.1141719818115234,
"learning_rate": 2.087722535557953e-05,
"loss": 0.0026,
"num_input_tokens_seen": 806608,
"step": 3765
},
{
"epoch": 11.968253968253968,
"grad_norm": 0.00021470033971127123,
"learning_rate": 2.0808930430727484e-05,
"loss": 0.0326,
"num_input_tokens_seen": 807680,
"step": 3770
},
{
"epoch": 11.984126984126984,
"grad_norm": 0.00046644502435810864,
"learning_rate": 2.0740667671962156e-05,
"loss": 0.0002,
"num_input_tokens_seen": 808720,
"step": 3775
},
{
"epoch": 12.0,
"grad_norm": 0.0005882336990907788,
"learning_rate": 2.067243760319415e-05,
"loss": 0.0,
"num_input_tokens_seen": 809792,
"step": 3780
},
{
"epoch": 12.0,
"eval_loss": 0.17750194668769836,
"eval_runtime": 1.4455,
"eval_samples_per_second": 48.426,
"eval_steps_per_second": 24.213,
"num_input_tokens_seen": 809792,
"step": 3780
},
{
"epoch": 12.015873015873016,
"grad_norm": 0.002724026096984744,
"learning_rate": 2.060424074808319e-05,
"loss": 0.0,
"num_input_tokens_seen": 810880,
"step": 3785
},
{
"epoch": 12.031746031746032,
"grad_norm": 0.002534243743866682,
"learning_rate": 2.0536077630034086e-05,
"loss": 0.0,
"num_input_tokens_seen": 811920,
"step": 3790
},
{
"epoch": 12.047619047619047,
"grad_norm": 0.051372405141592026,
"learning_rate": 2.0467948772192713e-05,
"loss": 0.0001,
"num_input_tokens_seen": 812976,
"step": 3795
},
{
"epoch": 12.063492063492063,
"grad_norm": 0.0038024040404707193,
"learning_rate": 2.0399854697442e-05,
"loss": 0.007,
"num_input_tokens_seen": 814112,
"step": 3800
},
{
"epoch": 12.079365079365079,
"grad_norm": 0.1364995837211609,
"learning_rate": 2.0331795928397916e-05,
"loss": 0.0001,
"num_input_tokens_seen": 815152,
"step": 3805
},
{
"epoch": 12.095238095238095,
"grad_norm": 0.034437455236911774,
"learning_rate": 2.0263772987405494e-05,
"loss": 0.0,
"num_input_tokens_seen": 816224,
"step": 3810
},
{
"epoch": 12.11111111111111,
"grad_norm": 0.00039293619920499623,
"learning_rate": 2.0195786396534743e-05,
"loss": 0.0,
"num_input_tokens_seen": 817328,
"step": 3815
},
{
"epoch": 12.126984126984127,
"grad_norm": 0.00905533879995346,
"learning_rate": 2.0127836677576717e-05,
"loss": 0.0,
"num_input_tokens_seen": 818416,
"step": 3820
},
{
"epoch": 12.142857142857142,
"grad_norm": 0.001928988378494978,
"learning_rate": 2.0059924352039463e-05,
"loss": 0.0,
"num_input_tokens_seen": 819536,
"step": 3825
},
{
"epoch": 12.158730158730158,
"grad_norm": 0.000600858882535249,
"learning_rate": 1.9992049941144066e-05,
"loss": 0.0003,
"num_input_tokens_seen": 820608,
"step": 3830
},
{
"epoch": 12.174603174603174,
"grad_norm": 0.0003672520397230983,
"learning_rate": 1.99242139658206e-05,
"loss": 0.0,
"num_input_tokens_seen": 821664,
"step": 3835
},
{
"epoch": 12.19047619047619,
"grad_norm": 0.00022892758715897799,
"learning_rate": 1.985641694670414e-05,
"loss": 0.0,
"num_input_tokens_seen": 822768,
"step": 3840
},
{
"epoch": 12.206349206349206,
"grad_norm": 0.00028045850922353566,
"learning_rate": 1.9788659404130776e-05,
"loss": 0.0,
"num_input_tokens_seen": 823792,
"step": 3845
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.00015690652071498334,
"learning_rate": 1.9720941858133658e-05,
"loss": 0.0,
"num_input_tokens_seen": 824928,
"step": 3850
},
{
"epoch": 12.238095238095237,
"grad_norm": 0.00023055235214997083,
"learning_rate": 1.9653264828438923e-05,
"loss": 0.0,
"num_input_tokens_seen": 825952,
"step": 3855
},
{
"epoch": 12.253968253968253,
"grad_norm": 0.00021435992675833404,
"learning_rate": 1.9585628834461766e-05,
"loss": 0.0,
"num_input_tokens_seen": 826960,
"step": 3860
},
{
"epoch": 12.26984126984127,
"grad_norm": 0.0001954881736310199,
"learning_rate": 1.9518034395302414e-05,
"loss": 0.0,
"num_input_tokens_seen": 827968,
"step": 3865
},
{
"epoch": 12.285714285714286,
"grad_norm": 0.0007147770957089961,
"learning_rate": 1.9450482029742217e-05,
"loss": 0.0,
"num_input_tokens_seen": 829040,
"step": 3870
},
{
"epoch": 12.301587301587302,
"grad_norm": 0.4170708954334259,
"learning_rate": 1.9382972256239563e-05,
"loss": 0.0005,
"num_input_tokens_seen": 830080,
"step": 3875
},
{
"epoch": 12.317460317460318,
"grad_norm": 0.000531713361851871,
"learning_rate": 1.931550559292597e-05,
"loss": 0.0001,
"num_input_tokens_seen": 831104,
"step": 3880
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.000578387756831944,
"learning_rate": 1.9248082557602078e-05,
"loss": 0.0,
"num_input_tokens_seen": 832144,
"step": 3885
},
{
"epoch": 12.34920634920635,
"grad_norm": 0.00023974425857886672,
"learning_rate": 1.9180703667733713e-05,
"loss": 0.0,
"num_input_tokens_seen": 833216,
"step": 3890
},
{
"epoch": 12.365079365079366,
"grad_norm": 0.013734663836658001,
"learning_rate": 1.911336944044786e-05,
"loss": 0.0,
"num_input_tokens_seen": 834304,
"step": 3895
},
{
"epoch": 12.380952380952381,
"grad_norm": 0.00040066608926281333,
"learning_rate": 1.9046080392528735e-05,
"loss": 0.0002,
"num_input_tokens_seen": 835424,
"step": 3900
},
{
"epoch": 12.396825396825397,
"grad_norm": 0.0012625795789062977,
"learning_rate": 1.89788370404138e-05,
"loss": 0.0001,
"num_input_tokens_seen": 836496,
"step": 3905
},
{
"epoch": 12.412698412698413,
"grad_norm": 0.0016813823021948338,
"learning_rate": 1.8911639900189818e-05,
"loss": 0.0,
"num_input_tokens_seen": 837568,
"step": 3910
},
{
"epoch": 12.428571428571429,
"grad_norm": 0.00017749129619915038,
"learning_rate": 1.8844489487588867e-05,
"loss": 0.0,
"num_input_tokens_seen": 838640,
"step": 3915
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.00817769207060337,
"learning_rate": 1.8777386317984404e-05,
"loss": 0.0,
"num_input_tokens_seen": 839696,
"step": 3920
},
{
"epoch": 12.46031746031746,
"grad_norm": 0.00019275283557362854,
"learning_rate": 1.871033090638729e-05,
"loss": 0.0,
"num_input_tokens_seen": 840704,
"step": 3925
},
{
"epoch": 12.476190476190476,
"grad_norm": 0.0051796757616102695,
"learning_rate": 1.864332376744186e-05,
"loss": 0.0,
"num_input_tokens_seen": 841792,
"step": 3930
},
{
"epoch": 12.492063492063492,
"grad_norm": 0.00040796739631332457,
"learning_rate": 1.857636541542195e-05,
"loss": 0.0529,
"num_input_tokens_seen": 842896,
"step": 3935
},
{
"epoch": 12.507936507936508,
"grad_norm": 0.00011765657836804166,
"learning_rate": 1.850945636422697e-05,
"loss": 0.0,
"num_input_tokens_seen": 843984,
"step": 3940
},
{
"epoch": 12.523809523809524,
"grad_norm": 0.00013000769831705838,
"learning_rate": 1.844259712737793e-05,
"loss": 0.0,
"num_input_tokens_seen": 845056,
"step": 3945
},
{
"epoch": 12.53968253968254,
"grad_norm": 0.00014240505697671324,
"learning_rate": 1.8375788218013556e-05,
"loss": 0.0,
"num_input_tokens_seen": 846128,
"step": 3950
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.0013110644649714231,
"learning_rate": 1.8309030148886284e-05,
"loss": 0.0,
"num_input_tokens_seen": 847152,
"step": 3955
},
{
"epoch": 12.571428571428571,
"grad_norm": 0.02542807348072529,
"learning_rate": 1.8242323432358365e-05,
"loss": 0.0001,
"num_input_tokens_seen": 848256,
"step": 3960
},
{
"epoch": 12.587301587301587,
"grad_norm": 0.009853278286755085,
"learning_rate": 1.8175668580397914e-05,
"loss": 0.0,
"num_input_tokens_seen": 849328,
"step": 3965
},
{
"epoch": 12.603174603174603,
"grad_norm": 0.0005432798061519861,
"learning_rate": 1.8109066104575023e-05,
"loss": 0.0,
"num_input_tokens_seen": 850400,
"step": 3970
},
{
"epoch": 12.619047619047619,
"grad_norm": 0.002314529847353697,
"learning_rate": 1.8042516516057763e-05,
"loss": 0.0,
"num_input_tokens_seen": 851504,
"step": 3975
},
{
"epoch": 12.634920634920634,
"grad_norm": 0.0008332771249115467,
"learning_rate": 1.7976020325608318e-05,
"loss": 0.0,
"num_input_tokens_seen": 852560,
"step": 3980
},
{
"epoch": 12.65079365079365,
"grad_norm": 0.013491770252585411,
"learning_rate": 1.7909578043579037e-05,
"loss": 0.0001,
"num_input_tokens_seen": 853632,
"step": 3985
},
{
"epoch": 12.666666666666666,
"grad_norm": 0.00032894726609811187,
"learning_rate": 1.784319017990855e-05,
"loss": 0.0028,
"num_input_tokens_seen": 854720,
"step": 3990
},
{
"epoch": 12.682539682539682,
"grad_norm": 1.5756915807724,
"learning_rate": 1.7776857244117807e-05,
"loss": 0.0007,
"num_input_tokens_seen": 855792,
"step": 3995
},
{
"epoch": 12.698412698412698,
"grad_norm": 0.00010613162157824263,
"learning_rate": 1.7710579745306193e-05,
"loss": 0.0,
"num_input_tokens_seen": 856896,
"step": 4000
},
{
"epoch": 12.714285714285714,
"grad_norm": 9.347883315058425e-05,
"learning_rate": 1.764435819214762e-05,
"loss": 0.0,
"num_input_tokens_seen": 857968,
"step": 4005
},
{
"epoch": 12.73015873015873,
"grad_norm": 0.00020842064986936748,
"learning_rate": 1.7578193092886647e-05,
"loss": 0.0,
"num_input_tokens_seen": 858992,
"step": 4010
},
{
"epoch": 12.746031746031747,
"grad_norm": 0.00023689692898187786,
"learning_rate": 1.751208495533452e-05,
"loss": 0.0,
"num_input_tokens_seen": 860032,
"step": 4015
},
{
"epoch": 12.761904761904763,
"grad_norm": 0.0012503145262598991,
"learning_rate": 1.744603428686533e-05,
"loss": 0.0,
"num_input_tokens_seen": 861136,
"step": 4020
},
{
"epoch": 12.777777777777779,
"grad_norm": 0.0003447967173997313,
"learning_rate": 1.7380041594412084e-05,
"loss": 0.0002,
"num_input_tokens_seen": 862272,
"step": 4025
},
{
"epoch": 12.793650793650794,
"grad_norm": 0.00016737495025154203,
"learning_rate": 1.731410738446284e-05,
"loss": 0.0,
"num_input_tokens_seen": 863280,
"step": 4030
},
{
"epoch": 12.80952380952381,
"grad_norm": 0.0002947594039142132,
"learning_rate": 1.724823216305681e-05,
"loss": 0.0003,
"num_input_tokens_seen": 864432,
"step": 4035
},
{
"epoch": 12.825396825396826,
"grad_norm": 0.00010101118095917627,
"learning_rate": 1.7182416435780454e-05,
"loss": 0.0008,
"num_input_tokens_seen": 865504,
"step": 4040
},
{
"epoch": 12.841269841269842,
"grad_norm": 0.00018932884267996997,
"learning_rate": 1.7116660707763636e-05,
"loss": 0.0,
"num_input_tokens_seen": 866560,
"step": 4045
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.00017417047638446093,
"learning_rate": 1.7050965483675743e-05,
"loss": 0.0,
"num_input_tokens_seen": 867680,
"step": 4050
},
{
"epoch": 12.873015873015873,
"grad_norm": 0.00018292553431820124,
"learning_rate": 1.698533126772177e-05,
"loss": 0.0,
"num_input_tokens_seen": 868800,
"step": 4055
},
{
"epoch": 12.88888888888889,
"grad_norm": 0.00012824099394492805,
"learning_rate": 1.6919758563638504e-05,
"loss": 0.0,
"num_input_tokens_seen": 869824,
"step": 4060
},
{
"epoch": 12.904761904761905,
"grad_norm": 0.004027406685054302,
"learning_rate": 1.6854247874690617e-05,
"loss": 0.0015,
"num_input_tokens_seen": 870912,
"step": 4065
},
{
"epoch": 12.920634920634921,
"grad_norm": 0.00022077480389270931,
"learning_rate": 1.678879970366683e-05,
"loss": 0.0001,
"num_input_tokens_seen": 872000,
"step": 4070
},
{
"epoch": 12.936507936507937,
"grad_norm": 0.0001212661009049043,
"learning_rate": 1.672341455287605e-05,
"loss": 0.0,
"num_input_tokens_seen": 872992,
"step": 4075
},
{
"epoch": 12.952380952380953,
"grad_norm": 0.00012811145279556513,
"learning_rate": 1.6658092924143497e-05,
"loss": 0.0,
"num_input_tokens_seen": 874064,
"step": 4080
},
{
"epoch": 12.968253968253968,
"grad_norm": 0.00033971265656873584,
"learning_rate": 1.6592835318806868e-05,
"loss": 0.0,
"num_input_tokens_seen": 875072,
"step": 4085
},
{
"epoch": 12.984126984126984,
"grad_norm": 0.00036233861465007067,
"learning_rate": 1.6527642237712494e-05,
"loss": 0.0,
"num_input_tokens_seen": 876144,
"step": 4090
},
{
"epoch": 13.0,
"grad_norm": 0.00014307050150819123,
"learning_rate": 1.646251418121148e-05,
"loss": 0.0004,
"num_input_tokens_seen": 877248,
"step": 4095
},
{
"epoch": 13.0,
"eval_loss": 0.19546957314014435,
"eval_runtime": 1.4542,
"eval_samples_per_second": 48.135,
"eval_steps_per_second": 24.068,
"num_input_tokens_seen": 877248,
"step": 4095
},
{
"epoch": 13.015873015873016,
"grad_norm": 0.00017348073015455157,
"learning_rate": 1.639745164915587e-05,
"loss": 0.0,
"num_input_tokens_seen": 878256,
"step": 4100
},
{
"epoch": 13.031746031746032,
"grad_norm": 0.0034831962548196316,
"learning_rate": 1.633245514089482e-05,
"loss": 0.0,
"num_input_tokens_seen": 879296,
"step": 4105
},
{
"epoch": 13.047619047619047,
"grad_norm": 0.0001327778008999303,
"learning_rate": 1.6267525155270773e-05,
"loss": 0.0,
"num_input_tokens_seen": 880448,
"step": 4110
},
{
"epoch": 13.063492063492063,
"grad_norm": 0.000698502582963556,
"learning_rate": 1.6202662190615586e-05,
"loss": 0.0,
"num_input_tokens_seen": 881568,
"step": 4115
},
{
"epoch": 13.079365079365079,
"grad_norm": 0.00014820935030002147,
"learning_rate": 1.6137866744746757e-05,
"loss": 0.0,
"num_input_tokens_seen": 882592,
"step": 4120
},
{
"epoch": 13.095238095238095,
"grad_norm": 0.00021490654035005718,
"learning_rate": 1.607313931496357e-05,
"loss": 0.0,
"num_input_tokens_seen": 883632,
"step": 4125
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.00028194382321089506,
"learning_rate": 1.6008480398043313e-05,
"loss": 0.0,
"num_input_tokens_seen": 884688,
"step": 4130
},
{
"epoch": 13.126984126984127,
"grad_norm": 0.013800282031297684,
"learning_rate": 1.5943890490237433e-05,
"loss": 0.0001,
"num_input_tokens_seen": 885776,
"step": 4135
},
{
"epoch": 13.142857142857142,
"grad_norm": 0.00010271323844790459,
"learning_rate": 1.5879370087267725e-05,
"loss": 0.0,
"num_input_tokens_seen": 886832,
"step": 4140
},
{
"epoch": 13.158730158730158,
"grad_norm": 0.005160802509635687,
"learning_rate": 1.5814919684322545e-05,
"loss": 0.0001,
"num_input_tokens_seen": 887904,
"step": 4145
},
{
"epoch": 13.174603174603174,
"grad_norm": 0.00016589944425504655,
"learning_rate": 1.575053977605303e-05,
"loss": 0.0,
"num_input_tokens_seen": 888976,
"step": 4150
},
{
"epoch": 13.19047619047619,
"grad_norm": 0.000149158135172911,
"learning_rate": 1.5686230856569252e-05,
"loss": 0.0,
"num_input_tokens_seen": 890032,
"step": 4155
},
{
"epoch": 13.206349206349206,
"grad_norm": 0.0001676021929597482,
"learning_rate": 1.5621993419436453e-05,
"loss": 0.0001,
"num_input_tokens_seen": 891136,
"step": 4160
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.00020368795958347619,
"learning_rate": 1.5557827957671248e-05,
"loss": 0.0,
"num_input_tokens_seen": 892256,
"step": 4165
},
{
"epoch": 13.238095238095237,
"grad_norm": 0.00017661222955211997,
"learning_rate": 1.549373496373788e-05,
"loss": 0.0,
"num_input_tokens_seen": 893376,
"step": 4170
},
{
"epoch": 13.253968253968253,
"grad_norm": 0.00016313417290803045,
"learning_rate": 1.542971492954437e-05,
"loss": 0.0,
"num_input_tokens_seen": 894432,
"step": 4175
},
{
"epoch": 13.26984126984127,
"grad_norm": 0.00040456498390994966,
"learning_rate": 1.5365768346438797e-05,
"loss": 0.0,
"num_input_tokens_seen": 895520,
"step": 4180
},
{
"epoch": 13.285714285714286,
"grad_norm": 0.0001557384239276871,
"learning_rate": 1.5301895705205503e-05,
"loss": 0.0,
"num_input_tokens_seen": 896688,
"step": 4185
},
{
"epoch": 13.301587301587302,
"grad_norm": 0.00019953801529482007,
"learning_rate": 1.5238097496061348e-05,
"loss": 0.0,
"num_input_tokens_seen": 897744,
"step": 4190
},
{
"epoch": 13.317460317460318,
"grad_norm": 0.00024302539532072842,
"learning_rate": 1.5174374208651912e-05,
"loss": 0.0,
"num_input_tokens_seen": 898800,
"step": 4195
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.00029446918051689863,
"learning_rate": 1.511072633204777e-05,
"loss": 0.0,
"num_input_tokens_seen": 899904,
"step": 4200
},
{
"epoch": 13.34920634920635,
"grad_norm": 0.0001966770796570927,
"learning_rate": 1.5047154354740717e-05,
"loss": 0.0,
"num_input_tokens_seen": 900928,
"step": 4205
},
{
"epoch": 13.365079365079366,
"grad_norm": 0.0003508214431349188,
"learning_rate": 1.4983658764640039e-05,
"loss": 0.0,
"num_input_tokens_seen": 901984,
"step": 4210
},
{
"epoch": 13.380952380952381,
"grad_norm": 0.00016046679229475558,
"learning_rate": 1.4920240049068748e-05,
"loss": 0.0,
"num_input_tokens_seen": 903008,
"step": 4215
},
{
"epoch": 13.396825396825397,
"grad_norm": 0.0002185764751629904,
"learning_rate": 1.4856898694759855e-05,
"loss": 0.0,
"num_input_tokens_seen": 904032,
"step": 4220
},
{
"epoch": 13.412698412698413,
"grad_norm": 0.00029554381035268307,
"learning_rate": 1.4793635187852622e-05,
"loss": 0.0,
"num_input_tokens_seen": 905040,
"step": 4225
},
{
"epoch": 13.428571428571429,
"grad_norm": 7.924468081910163e-05,
"learning_rate": 1.4730450013888857e-05,
"loss": 0.0133,
"num_input_tokens_seen": 906176,
"step": 4230
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.00014417868806049228,
"learning_rate": 1.4667343657809152e-05,
"loss": 0.0,
"num_input_tokens_seen": 907296,
"step": 4235
},
{
"epoch": 13.46031746031746,
"grad_norm": 0.0023033898323774338,
"learning_rate": 1.4604316603949186e-05,
"loss": 0.0,
"num_input_tokens_seen": 908352,
"step": 4240
},
{
"epoch": 13.476190476190476,
"grad_norm": 0.00021005469898227602,
"learning_rate": 1.4541369336035988e-05,
"loss": 0.0,
"num_input_tokens_seen": 909376,
"step": 4245
},
{
"epoch": 13.492063492063492,
"grad_norm": 0.0001569826272316277,
"learning_rate": 1.4478502337184274e-05,
"loss": 0.0,
"num_input_tokens_seen": 910448,
"step": 4250
},
{
"epoch": 13.507936507936508,
"grad_norm": 0.003406015457585454,
"learning_rate": 1.4415716089892656e-05,
"loss": 0.0,
"num_input_tokens_seen": 911488,
"step": 4255
},
{
"epoch": 13.523809523809524,
"grad_norm": 0.0002489403123036027,
"learning_rate": 1.4353011076040021e-05,
"loss": 0.0,
"num_input_tokens_seen": 912528,
"step": 4260
},
{
"epoch": 13.53968253968254,
"grad_norm": 0.0004728223429992795,
"learning_rate": 1.4290387776881764e-05,
"loss": 0.0,
"num_input_tokens_seen": 913568,
"step": 4265
},
{
"epoch": 13.555555555555555,
"grad_norm": 0.0011484220158308744,
"learning_rate": 1.422784667304615e-05,
"loss": 0.0,
"num_input_tokens_seen": 914656,
"step": 4270
},
{
"epoch": 13.571428571428571,
"grad_norm": 0.00022116370382718742,
"learning_rate": 1.4165388244530608e-05,
"loss": 0.0,
"num_input_tokens_seen": 915728,
"step": 4275
},
{
"epoch": 13.587301587301587,
"grad_norm": 0.0001274219830520451,
"learning_rate": 1.4103012970698016e-05,
"loss": 0.0,
"num_input_tokens_seen": 916816,
"step": 4280
},
{
"epoch": 13.603174603174603,
"grad_norm": 0.00016120154759846628,
"learning_rate": 1.4040721330273062e-05,
"loss": 0.0,
"num_input_tokens_seen": 917904,
"step": 4285
},
{
"epoch": 13.619047619047619,
"grad_norm": 0.0003869999200105667,
"learning_rate": 1.397851380133857e-05,
"loss": 0.0,
"num_input_tokens_seen": 918960,
"step": 4290
},
{
"epoch": 13.634920634920634,
"grad_norm": 0.00033192671253345907,
"learning_rate": 1.3916390861331774e-05,
"loss": 0.0,
"num_input_tokens_seen": 920064,
"step": 4295
},
{
"epoch": 13.65079365079365,
"grad_norm": 0.00014532014029100537,
"learning_rate": 1.3854352987040747e-05,
"loss": 0.0,
"num_input_tokens_seen": 921152,
"step": 4300
},
{
"epoch": 13.666666666666666,
"grad_norm": 0.0001411033299518749,
"learning_rate": 1.379240065460064e-05,
"loss": 0.0,
"num_input_tokens_seen": 922208,
"step": 4305
},
{
"epoch": 13.682539682539682,
"grad_norm": 0.006365750916302204,
"learning_rate": 1.3730534339490114e-05,
"loss": 0.0,
"num_input_tokens_seen": 923312,
"step": 4310
},
{
"epoch": 13.698412698412698,
"grad_norm": 0.0001843793725129217,
"learning_rate": 1.3668754516527655e-05,
"loss": 0.0003,
"num_input_tokens_seen": 924400,
"step": 4315
},
{
"epoch": 13.714285714285714,
"grad_norm": 0.00045810375013388693,
"learning_rate": 1.3607061659867892e-05,
"loss": 0.0,
"num_input_tokens_seen": 925472,
"step": 4320
},
{
"epoch": 13.73015873015873,
"grad_norm": 0.0009504028712399304,
"learning_rate": 1.3545456242998039e-05,
"loss": 0.0,
"num_input_tokens_seen": 926592,
"step": 4325
},
{
"epoch": 13.746031746031747,
"grad_norm": 0.0018185972003266215,
"learning_rate": 1.3483938738734198e-05,
"loss": 0.0,
"num_input_tokens_seen": 927680,
"step": 4330
},
{
"epoch": 13.761904761904763,
"grad_norm": 0.0010512792505323887,
"learning_rate": 1.3422509619217738e-05,
"loss": 0.0,
"num_input_tokens_seen": 928784,
"step": 4335
},
{
"epoch": 13.777777777777779,
"grad_norm": 0.00015952046669553965,
"learning_rate": 1.3361169355911715e-05,
"loss": 0.0,
"num_input_tokens_seen": 929776,
"step": 4340
},
{
"epoch": 13.793650793650794,
"grad_norm": 0.0001598140806891024,
"learning_rate": 1.3299918419597171e-05,
"loss": 0.0,
"num_input_tokens_seen": 930912,
"step": 4345
},
{
"epoch": 13.80952380952381,
"grad_norm": 0.0001969587174244225,
"learning_rate": 1.323875728036964e-05,
"loss": 0.0,
"num_input_tokens_seen": 932048,
"step": 4350
},
{
"epoch": 13.825396825396826,
"grad_norm": 0.0056450143456459045,
"learning_rate": 1.3177686407635417e-05,
"loss": 0.0,
"num_input_tokens_seen": 933152,
"step": 4355
},
{
"epoch": 13.841269841269842,
"grad_norm": 0.00041293379035778344,
"learning_rate": 1.3116706270108015e-05,
"loss": 0.0,
"num_input_tokens_seen": 934240,
"step": 4360
},
{
"epoch": 13.857142857142858,
"grad_norm": 0.00012529375089798123,
"learning_rate": 1.3055817335804582e-05,
"loss": 0.0,
"num_input_tokens_seen": 935328,
"step": 4365
},
{
"epoch": 13.873015873015873,
"grad_norm": 0.0004058619379065931,
"learning_rate": 1.2995020072042285e-05,
"loss": 0.0,
"num_input_tokens_seen": 936400,
"step": 4370
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.0001662838039919734,
"learning_rate": 1.2934314945434734e-05,
"loss": 0.0,
"num_input_tokens_seen": 937456,
"step": 4375
},
{
"epoch": 13.904761904761905,
"grad_norm": 0.0003785460430663079,
"learning_rate": 1.2873702421888365e-05,
"loss": 0.0,
"num_input_tokens_seen": 938496,
"step": 4380
},
{
"epoch": 13.920634920634921,
"grad_norm": 0.0023042745888233185,
"learning_rate": 1.2813182966598902e-05,
"loss": 0.0,
"num_input_tokens_seen": 939568,
"step": 4385
},
{
"epoch": 13.936507936507937,
"grad_norm": 0.00040539150359109044,
"learning_rate": 1.2752757044047827e-05,
"loss": 0.0,
"num_input_tokens_seen": 940592,
"step": 4390
},
{
"epoch": 13.952380952380953,
"grad_norm": 0.0001386718067806214,
"learning_rate": 1.2692425117998699e-05,
"loss": 0.0,
"num_input_tokens_seen": 941632,
"step": 4395
},
{
"epoch": 13.968253968253968,
"grad_norm": 0.00020760892948601395,
"learning_rate": 1.263218765149371e-05,
"loss": 0.0,
"num_input_tokens_seen": 942656,
"step": 4400
},
{
"epoch": 13.984126984126984,
"grad_norm": 0.0005386440316215158,
"learning_rate": 1.257204510685005e-05,
"loss": 0.0,
"num_input_tokens_seen": 943728,
"step": 4405
},
{
"epoch": 14.0,
"grad_norm": 9.081437747227028e-05,
"learning_rate": 1.2511997945656415e-05,
"loss": 0.0,
"num_input_tokens_seen": 944752,
"step": 4410
},
{
"epoch": 14.0,
"eval_loss": 0.19688165187835693,
"eval_runtime": 1.4465,
"eval_samples_per_second": 48.392,
"eval_steps_per_second": 24.196,
"num_input_tokens_seen": 944752,
"step": 4410
},
{
"epoch": 14.015873015873016,
"grad_norm": 0.0016641680849716067,
"learning_rate": 1.2452046628769443e-05,
"loss": 0.0,
"num_input_tokens_seen": 945872,
"step": 4415
},
{
"epoch": 14.031746031746032,
"grad_norm": 0.00016903673531487584,
"learning_rate": 1.2392191616310148e-05,
"loss": 0.0,
"num_input_tokens_seen": 946928,
"step": 4420
},
{
"epoch": 14.047619047619047,
"grad_norm": 0.00014750863192602992,
"learning_rate": 1.2332433367660442e-05,
"loss": 0.0,
"num_input_tokens_seen": 947952,
"step": 4425
},
{
"epoch": 14.063492063492063,
"grad_norm": 0.0013234770158305764,
"learning_rate": 1.227277234145959e-05,
"loss": 0.0,
"num_input_tokens_seen": 949056,
"step": 4430
},
{
"epoch": 14.079365079365079,
"grad_norm": 0.00011105871817562729,
"learning_rate": 1.2213208995600648e-05,
"loss": 0.0,
"num_input_tokens_seen": 950128,
"step": 4435
},
{
"epoch": 14.095238095238095,
"grad_norm": 0.00011416849883971736,
"learning_rate": 1.2153743787227023e-05,
"loss": 0.0,
"num_input_tokens_seen": 951280,
"step": 4440
},
{
"epoch": 14.11111111111111,
"grad_norm": 0.00014661815657746047,
"learning_rate": 1.2094377172728891e-05,
"loss": 0.0,
"num_input_tokens_seen": 952400,
"step": 4445
},
{
"epoch": 14.126984126984127,
"grad_norm": 0.00014966045273467898,
"learning_rate": 1.2035109607739755e-05,
"loss": 0.0,
"num_input_tokens_seen": 953472,
"step": 4450
},
{
"epoch": 14.142857142857142,
"grad_norm": 0.00011174564860993996,
"learning_rate": 1.1975941547132922e-05,
"loss": 0.0001,
"num_input_tokens_seen": 954464,
"step": 4455
},
{
"epoch": 14.158730158730158,
"grad_norm": 0.0003684433759190142,
"learning_rate": 1.1916873445017982e-05,
"loss": 0.0,
"num_input_tokens_seen": 955520,
"step": 4460
},
{
"epoch": 14.174603174603174,
"grad_norm": 0.0008453542250208557,
"learning_rate": 1.185790575473738e-05,
"loss": 0.0,
"num_input_tokens_seen": 956624,
"step": 4465
},
{
"epoch": 14.19047619047619,
"grad_norm": 9.045572369359434e-05,
"learning_rate": 1.1799038928862919e-05,
"loss": 0.0,
"num_input_tokens_seen": 957728,
"step": 4470
},
{
"epoch": 14.206349206349206,
"grad_norm": 0.0008998040575534105,
"learning_rate": 1.1740273419192233e-05,
"loss": 0.0,
"num_input_tokens_seen": 958768,
"step": 4475
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.00014305087097454816,
"learning_rate": 1.1681609676745411e-05,
"loss": 0.0,
"num_input_tokens_seen": 959824,
"step": 4480
},
{
"epoch": 14.238095238095237,
"grad_norm": 0.00012867158511653543,
"learning_rate": 1.1623048151761436e-05,
"loss": 0.0,
"num_input_tokens_seen": 960848,
"step": 4485
},
{
"epoch": 14.253968253968253,
"grad_norm": 8.349162089871243e-05,
"learning_rate": 1.1564589293694855e-05,
"loss": 0.0,
"num_input_tokens_seen": 961984,
"step": 4490
},
{
"epoch": 14.26984126984127,
"grad_norm": 0.00010760652367025614,
"learning_rate": 1.1506233551212186e-05,
"loss": 0.0,
"num_input_tokens_seen": 963072,
"step": 4495
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.0005234842537902296,
"learning_rate": 1.1447981372188563e-05,
"loss": 0.0,
"num_input_tokens_seen": 964176,
"step": 4500
},
{
"epoch": 14.301587301587302,
"grad_norm": 0.00019558188796509057,
"learning_rate": 1.1389833203704294e-05,
"loss": 0.0,
"num_input_tokens_seen": 965232,
"step": 4505
},
{
"epoch": 14.317460317460318,
"grad_norm": 0.00011165087198605761,
"learning_rate": 1.133178949204141e-05,
"loss": 0.0,
"num_input_tokens_seen": 966320,
"step": 4510
},
{
"epoch": 14.333333333333334,
"grad_norm": 0.00018003462173510343,
"learning_rate": 1.1273850682680252e-05,
"loss": 0.0,
"num_input_tokens_seen": 967440,
"step": 4515
},
{
"epoch": 14.34920634920635,
"grad_norm": 0.005392876453697681,
"learning_rate": 1.1216017220296026e-05,
"loss": 0.0,
"num_input_tokens_seen": 968480,
"step": 4520
},
{
"epoch": 14.365079365079366,
"grad_norm": 0.00011155071842949837,
"learning_rate": 1.1158289548755399e-05,
"loss": 0.0,
"num_input_tokens_seen": 969536,
"step": 4525
},
{
"epoch": 14.380952380952381,
"grad_norm": 0.005166823975741863,
"learning_rate": 1.1100668111113166e-05,
"loss": 0.0,
"num_input_tokens_seen": 970608,
"step": 4530
},
{
"epoch": 14.396825396825397,
"grad_norm": 0.008961636573076248,
"learning_rate": 1.104315334960871e-05,
"loss": 0.0,
"num_input_tokens_seen": 971600,
"step": 4535
},
{
"epoch": 14.412698412698413,
"grad_norm": 6.83280813973397e-05,
"learning_rate": 1.0985745705662737e-05,
"loss": 0.0,
"num_input_tokens_seen": 972688,
"step": 4540
},
{
"epoch": 14.428571428571429,
"grad_norm": 0.00025904824724420905,
"learning_rate": 1.0928445619873795e-05,
"loss": 0.0,
"num_input_tokens_seen": 973760,
"step": 4545
},
{
"epoch": 14.444444444444445,
"grad_norm": 9.120917093241587e-05,
"learning_rate": 1.0871253532014969e-05,
"loss": 0.0,
"num_input_tokens_seen": 974832,
"step": 4550
},
{
"epoch": 14.46031746031746,
"grad_norm": 0.0001160187239293009,
"learning_rate": 1.0814169881030459e-05,
"loss": 0.0,
"num_input_tokens_seen": 975904,
"step": 4555
},
{
"epoch": 14.476190476190476,
"grad_norm": 0.00010269311314914376,
"learning_rate": 1.0757195105032198e-05,
"loss": 0.0,
"num_input_tokens_seen": 977008,
"step": 4560
},
{
"epoch": 14.492063492063492,
"grad_norm": 9.548700472805649e-05,
"learning_rate": 1.0700329641296541e-05,
"loss": 0.0,
"num_input_tokens_seen": 978080,
"step": 4565
},
{
"epoch": 14.507936507936508,
"grad_norm": 0.009873582050204277,
"learning_rate": 1.064357392626088e-05,
"loss": 0.0,
"num_input_tokens_seen": 979184,
"step": 4570
},
{
"epoch": 14.523809523809524,
"grad_norm": 0.0005500807310454547,
"learning_rate": 1.0586928395520271e-05,
"loss": 0.0,
"num_input_tokens_seen": 980272,
"step": 4575
},
{
"epoch": 14.53968253968254,
"grad_norm": 0.00010129127622349188,
"learning_rate": 1.053039348382415e-05,
"loss": 0.0,
"num_input_tokens_seen": 981408,
"step": 4580
},
{
"epoch": 14.555555555555555,
"grad_norm": 0.0009250047733075917,
"learning_rate": 1.0473969625072922e-05,
"loss": 0.0,
"num_input_tokens_seen": 982496,
"step": 4585
},
{
"epoch": 14.571428571428571,
"grad_norm": 9.661580406827852e-05,
"learning_rate": 1.0417657252314702e-05,
"loss": 0.0,
"num_input_tokens_seen": 983536,
"step": 4590
},
{
"epoch": 14.587301587301587,
"grad_norm": 0.00022067976533435285,
"learning_rate": 1.0361456797741959e-05,
"loss": 0.0,
"num_input_tokens_seen": 984576,
"step": 4595
},
{
"epoch": 14.603174603174603,
"grad_norm": 0.00020819882047362626,
"learning_rate": 1.0305368692688174e-05,
"loss": 0.0,
"num_input_tokens_seen": 985584,
"step": 4600
},
{
"epoch": 14.619047619047619,
"grad_norm": 0.0002456950314808637,
"learning_rate": 1.0249393367624579e-05,
"loss": 0.0,
"num_input_tokens_seen": 986640,
"step": 4605
},
{
"epoch": 14.634920634920634,
"grad_norm": 0.00010459975601406768,
"learning_rate": 1.0193531252156833e-05,
"loss": 0.0,
"num_input_tokens_seen": 987648,
"step": 4610
},
{
"epoch": 14.65079365079365,
"grad_norm": 9.877282718662173e-05,
"learning_rate": 1.0137782775021686e-05,
"loss": 0.0,
"num_input_tokens_seen": 988768,
"step": 4615
},
{
"epoch": 14.666666666666666,
"grad_norm": 0.00016360699373763055,
"learning_rate": 1.008214836408378e-05,
"loss": 0.0,
"num_input_tokens_seen": 989856,
"step": 4620
},
{
"epoch": 14.682539682539682,
"grad_norm": 0.00014213789836503565,
"learning_rate": 1.0026628446332248e-05,
"loss": 0.0,
"num_input_tokens_seen": 990896,
"step": 4625
},
{
"epoch": 14.698412698412698,
"grad_norm": 0.0007357973954640329,
"learning_rate": 9.97122344787754e-06,
"loss": 0.0,
"num_input_tokens_seen": 991952,
"step": 4630
},
{
"epoch": 14.714285714285714,
"grad_norm": 0.00011429537698859349,
"learning_rate": 9.91593379394811e-06,
"loss": 0.0,
"num_input_tokens_seen": 993008,
"step": 4635
},
{
"epoch": 14.73015873015873,
"grad_norm": 0.00015254374011419713,
"learning_rate": 9.860759908887122e-06,
"loss": 0.0,
"num_input_tokens_seen": 994048,
"step": 4640
},
{
"epoch": 14.746031746031747,
"grad_norm": 0.00013117569324094802,
"learning_rate": 9.805702216149251e-06,
"loss": 0.0,
"num_input_tokens_seen": 995104,
"step": 4645
},
{
"epoch": 14.761904761904763,
"grad_norm": 0.0011479692766442895,
"learning_rate": 9.75076113829741e-06,
"loss": 0.0,
"num_input_tokens_seen": 996208,
"step": 4650
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.00013996977941133082,
"learning_rate": 9.695937096999475e-06,
"loss": 0.0,
"num_input_tokens_seen": 997280,
"step": 4655
},
{
"epoch": 14.793650793650794,
"grad_norm": 0.0001853140420280397,
"learning_rate": 9.641230513025107e-06,
"loss": 0.0,
"num_input_tokens_seen": 998320,
"step": 4660
},
{
"epoch": 14.80952380952381,
"grad_norm": 0.000127577266539447,
"learning_rate": 9.586641806242457e-06,
"loss": 0.0,
"num_input_tokens_seen": 999504,
"step": 4665
},
{
"epoch": 14.825396825396826,
"grad_norm": 0.002337446203455329,
"learning_rate": 9.532171395615036e-06,
"loss": 0.0,
"num_input_tokens_seen": 1000576,
"step": 4670
},
{
"epoch": 14.841269841269842,
"grad_norm": 0.00014514128270093352,
"learning_rate": 9.477819699198379e-06,
"loss": 0.0002,
"num_input_tokens_seen": 1001616,
"step": 4675
},
{
"epoch": 14.857142857142858,
"grad_norm": 0.00025934414588846266,
"learning_rate": 9.423587134136949e-06,
"loss": 0.0,
"num_input_tokens_seen": 1002704,
"step": 4680
},
{
"epoch": 14.873015873015873,
"grad_norm": 0.0001345455675618723,
"learning_rate": 9.369474116660848e-06,
"loss": 0.0,
"num_input_tokens_seen": 1003776,
"step": 4685
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.00010907748946920037,
"learning_rate": 9.315481062082687e-06,
"loss": 0.0,
"num_input_tokens_seen": 1004816,
"step": 4690
},
{
"epoch": 14.904761904761905,
"grad_norm": 0.013357513584196568,
"learning_rate": 9.261608384794374e-06,
"loss": 0.0,
"num_input_tokens_seen": 1005888,
"step": 4695
},
{
"epoch": 14.920634920634921,
"grad_norm": 9.384198347106576e-05,
"learning_rate": 9.207856498263902e-06,
"loss": 0.0,
"num_input_tokens_seen": 1006976,
"step": 4700
},
{
"epoch": 14.936507936507937,
"grad_norm": 0.007067098747938871,
"learning_rate": 9.154225815032242e-06,
"loss": 0.0,
"num_input_tokens_seen": 1008064,
"step": 4705
},
{
"epoch": 14.952380952380953,
"grad_norm": 0.0002748421102296561,
"learning_rate": 9.100716746710126e-06,
"loss": 0.0,
"num_input_tokens_seen": 1009120,
"step": 4710
},
{
"epoch": 14.968253968253968,
"grad_norm": 0.00014957235543988645,
"learning_rate": 9.047329703974888e-06,
"loss": 0.0,
"num_input_tokens_seen": 1010192,
"step": 4715
},
{
"epoch": 14.984126984126984,
"grad_norm": 0.00050693703815341,
"learning_rate": 8.994065096567355e-06,
"loss": 0.0,
"num_input_tokens_seen": 1011248,
"step": 4720
},
{
"epoch": 15.0,
"grad_norm": 0.00014735362492501736,
"learning_rate": 8.940923333288643e-06,
"loss": 0.0,
"num_input_tokens_seen": 1012272,
"step": 4725
},
{
"epoch": 15.0,
"eval_loss": 0.19820746779441833,
"eval_runtime": 1.444,
"eval_samples_per_second": 48.478,
"eval_steps_per_second": 24.239,
"num_input_tokens_seen": 1012272,
"step": 4725
},
{
"epoch": 15.015873015873016,
"grad_norm": 0.00021902845764998347,
"learning_rate": 8.88790482199707e-06,
"loss": 0.0,
"num_input_tokens_seen": 1013296,
"step": 4730
},
{
"epoch": 15.031746031746032,
"grad_norm": 0.00099784170743078,
"learning_rate": 8.835009969605012e-06,
"loss": 0.0,
"num_input_tokens_seen": 1014384,
"step": 4735
},
{
"epoch": 15.047619047619047,
"grad_norm": 0.000178317932295613,
"learning_rate": 8.78223918207575e-06,
"loss": 0.0,
"num_input_tokens_seen": 1015472,
"step": 4740
},
{
"epoch": 15.063492063492063,
"grad_norm": 0.00011670400999719277,
"learning_rate": 8.729592864420394e-06,
"loss": 0.0,
"num_input_tokens_seen": 1016544,
"step": 4745
},
{
"epoch": 15.079365079365079,
"grad_norm": 0.003956442698836327,
"learning_rate": 8.677071420694769e-06,
"loss": 0.0,
"num_input_tokens_seen": 1017664,
"step": 4750
},
{
"epoch": 15.095238095238095,
"grad_norm": 0.000169846520293504,
"learning_rate": 8.62467525399627e-06,
"loss": 0.0,
"num_input_tokens_seen": 1018784,
"step": 4755
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.00024720613146200776,
"learning_rate": 8.572404766460846e-06,
"loss": 0.0,
"num_input_tokens_seen": 1019808,
"step": 4760
},
{
"epoch": 15.126984126984127,
"grad_norm": 0.00011122092109872028,
"learning_rate": 8.520260359259822e-06,
"loss": 0.0,
"num_input_tokens_seen": 1020896,
"step": 4765
},
{
"epoch": 15.142857142857142,
"grad_norm": 0.0001617560046724975,
"learning_rate": 8.468242432596904e-06,
"loss": 0.0,
"num_input_tokens_seen": 1022000,
"step": 4770
},
{
"epoch": 15.158730158730158,
"grad_norm": 0.00025247674784623086,
"learning_rate": 8.41635138570507e-06,
"loss": 0.0,
"num_input_tokens_seen": 1023088,
"step": 4775
},
{
"epoch": 15.174603174603174,
"grad_norm": 0.0001408685347996652,
"learning_rate": 8.364587616843477e-06,
"loss": 0.0,
"num_input_tokens_seen": 1024192,
"step": 4780
},
{
"epoch": 15.19047619047619,
"grad_norm": 0.00010588414443191141,
"learning_rate": 8.312951523294462e-06,
"loss": 0.0,
"num_input_tokens_seen": 1025232,
"step": 4785
},
{
"epoch": 15.206349206349206,
"grad_norm": 7.913958688732237e-05,
"learning_rate": 8.261443501360466e-06,
"loss": 0.0,
"num_input_tokens_seen": 1026304,
"step": 4790
},
{
"epoch": 15.222222222222221,
"grad_norm": 0.00014240090968087316,
"learning_rate": 8.210063946360964e-06,
"loss": 0.0,
"num_input_tokens_seen": 1027424,
"step": 4795
},
{
"epoch": 15.238095238095237,
"grad_norm": 0.00010943754023173824,
"learning_rate": 8.158813252629497e-06,
"loss": 0.0,
"num_input_tokens_seen": 1028496,
"step": 4800
},
{
"epoch": 15.253968253968253,
"grad_norm": 0.0002636691788211465,
"learning_rate": 8.107691813510562e-06,
"loss": 0.0,
"num_input_tokens_seen": 1029584,
"step": 4805
},
{
"epoch": 15.26984126984127,
"grad_norm": 0.00046621993533335626,
"learning_rate": 8.056700021356694e-06,
"loss": 0.0,
"num_input_tokens_seen": 1030672,
"step": 4810
},
{
"epoch": 15.285714285714286,
"grad_norm": 0.00011222544708289206,
"learning_rate": 8.005838267525356e-06,
"loss": 0.0,
"num_input_tokens_seen": 1031680,
"step": 4815
},
{
"epoch": 15.301587301587302,
"grad_norm": 0.00013111262524034828,
"learning_rate": 7.955106942375985e-06,
"loss": 0.0,
"num_input_tokens_seen": 1032720,
"step": 4820
},
{
"epoch": 15.317460317460318,
"grad_norm": 0.0001799971651053056,
"learning_rate": 7.904506435266998e-06,
"loss": 0.0,
"num_input_tokens_seen": 1033728,
"step": 4825
},
{
"epoch": 15.333333333333334,
"grad_norm": 0.007164331618696451,
"learning_rate": 7.854037134552797e-06,
"loss": 0.0,
"num_input_tokens_seen": 1034784,
"step": 4830
},
{
"epoch": 15.34920634920635,
"grad_norm": 0.00014734613068867475,
"learning_rate": 7.803699427580789e-06,
"loss": 0.0,
"num_input_tokens_seen": 1035872,
"step": 4835
},
{
"epoch": 15.365079365079366,
"grad_norm": 0.003119000233709812,
"learning_rate": 7.753493700688397e-06,
"loss": 0.0,
"num_input_tokens_seen": 1036960,
"step": 4840
},
{
"epoch": 15.380952380952381,
"grad_norm": 0.00016122111992444843,
"learning_rate": 7.703420339200101e-06,
"loss": 0.0,
"num_input_tokens_seen": 1038064,
"step": 4845
},
{
"epoch": 15.396825396825397,
"grad_norm": 0.00026882500969804823,
"learning_rate": 7.653479727424534e-06,
"loss": 0.0,
"num_input_tokens_seen": 1039152,
"step": 4850
},
{
"epoch": 15.412698412698413,
"grad_norm": 9.242565283784643e-05,
"learning_rate": 7.603672248651431e-06,
"loss": 0.0,
"num_input_tokens_seen": 1040240,
"step": 4855
},
{
"epoch": 15.428571428571429,
"grad_norm": 0.003781526582315564,
"learning_rate": 7.553998285148786e-06,
"loss": 0.0,
"num_input_tokens_seen": 1041264,
"step": 4860
},
{
"epoch": 15.444444444444445,
"grad_norm": 0.00021264157840050757,
"learning_rate": 7.504458218159841e-06,
"loss": 0.0,
"num_input_tokens_seen": 1042288,
"step": 4865
},
{
"epoch": 15.46031746031746,
"grad_norm": 0.0001260324497707188,
"learning_rate": 7.455052427900213e-06,
"loss": 0.0,
"num_input_tokens_seen": 1043392,
"step": 4870
},
{
"epoch": 15.476190476190476,
"grad_norm": 0.00015318683290388435,
"learning_rate": 7.405781293554973e-06,
"loss": 0.0,
"num_input_tokens_seen": 1044496,
"step": 4875
},
{
"epoch": 15.492063492063492,
"grad_norm": 0.0002370552538195625,
"learning_rate": 7.3566451932756744e-06,
"loss": 0.0,
"num_input_tokens_seen": 1045504,
"step": 4880
},
{
"epoch": 15.507936507936508,
"grad_norm": 0.0016014057910069823,
"learning_rate": 7.307644504177538e-06,
"loss": 0.0,
"num_input_tokens_seen": 1046592,
"step": 4885
},
{
"epoch": 15.523809523809524,
"grad_norm": 0.00011496651131892577,
"learning_rate": 7.258779602336504e-06,
"loss": 0.0,
"num_input_tokens_seen": 1047728,
"step": 4890
},
{
"epoch": 15.53968253968254,
"grad_norm": 0.0014618715504184365,
"learning_rate": 7.210050862786341e-06,
"loss": 0.0,
"num_input_tokens_seen": 1048848,
"step": 4895
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.00013625272549688816,
"learning_rate": 7.161458659515813e-06,
"loss": 0.0,
"num_input_tokens_seen": 1049872,
"step": 4900
},
{
"epoch": 15.571428571428571,
"grad_norm": 0.00021962377650197595,
"learning_rate": 7.113003365465745e-06,
"loss": 0.0,
"num_input_tokens_seen": 1050944,
"step": 4905
},
{
"epoch": 15.587301587301587,
"grad_norm": 0.006843153852969408,
"learning_rate": 7.064685352526229e-06,
"loss": 0.0,
"num_input_tokens_seen": 1051968,
"step": 4910
},
{
"epoch": 15.603174603174603,
"grad_norm": 0.00014317109889816493,
"learning_rate": 7.016504991533726e-06,
"loss": 0.0,
"num_input_tokens_seen": 1053104,
"step": 4915
},
{
"epoch": 15.619047619047619,
"grad_norm": 0.00021036296675447375,
"learning_rate": 6.9684626522682154e-06,
"loss": 0.0,
"num_input_tokens_seen": 1054144,
"step": 4920
},
{
"epoch": 15.634920634920634,
"grad_norm": 0.0006890058284625411,
"learning_rate": 6.920558703450389e-06,
"loss": 0.0,
"num_input_tokens_seen": 1055168,
"step": 4925
},
{
"epoch": 15.65079365079365,
"grad_norm": 7.351509702857584e-05,
"learning_rate": 6.872793512738809e-06,
"loss": 0.0,
"num_input_tokens_seen": 1056208,
"step": 4930
},
{
"epoch": 15.666666666666666,
"grad_norm": 0.00024063632008619606,
"learning_rate": 6.825167446727057e-06,
"loss": 0.0,
"num_input_tokens_seen": 1057280,
"step": 4935
},
{
"epoch": 15.682539682539682,
"grad_norm": 0.006952513474971056,
"learning_rate": 6.777680870940972e-06,
"loss": 0.0,
"num_input_tokens_seen": 1058352,
"step": 4940
},
{
"epoch": 15.698412698412698,
"grad_norm": 0.0001248103944817558,
"learning_rate": 6.730334149835788e-06,
"loss": 0.0,
"num_input_tokens_seen": 1059408,
"step": 4945
},
{
"epoch": 15.714285714285714,
"grad_norm": 9.666664118412882e-05,
"learning_rate": 6.683127646793411e-06,
"loss": 0.0,
"num_input_tokens_seen": 1060528,
"step": 4950
},
{
"epoch": 15.73015873015873,
"grad_norm": 0.0012612607097253203,
"learning_rate": 6.636061724119541e-06,
"loss": 0.0,
"num_input_tokens_seen": 1061600,
"step": 4955
},
{
"epoch": 15.746031746031747,
"grad_norm": 0.00018189875117968768,
"learning_rate": 6.589136743040955e-06,
"loss": 0.0,
"num_input_tokens_seen": 1062640,
"step": 4960
},
{
"epoch": 15.761904761904763,
"grad_norm": 0.00012195859744679183,
"learning_rate": 6.542353063702716e-06,
"loss": 0.0,
"num_input_tokens_seen": 1063648,
"step": 4965
},
{
"epoch": 15.777777777777779,
"grad_norm": 0.005103013478219509,
"learning_rate": 6.495711045165412e-06,
"loss": 0.0001,
"num_input_tokens_seen": 1064752,
"step": 4970
},
{
"epoch": 15.793650793650794,
"grad_norm": 0.00019367771164979786,
"learning_rate": 6.449211045402395e-06,
"loss": 0.0,
"num_input_tokens_seen": 1065808,
"step": 4975
},
{
"epoch": 15.80952380952381,
"grad_norm": 0.00011932419874938205,
"learning_rate": 6.402853421297034e-06,
"loss": 0.0,
"num_input_tokens_seen": 1066848,
"step": 4980
},
{
"epoch": 15.825396825396826,
"grad_norm": 9.995235450332984e-05,
"learning_rate": 6.356638528639955e-06,
"loss": 0.0,
"num_input_tokens_seen": 1067808,
"step": 4985
},
{
"epoch": 15.841269841269842,
"grad_norm": 0.00012767007865477353,
"learning_rate": 6.3105667221263845e-06,
"loss": 0.0,
"num_input_tokens_seen": 1068912,
"step": 4990
},
{
"epoch": 15.857142857142858,
"grad_norm": 0.0002976842224597931,
"learning_rate": 6.2646383553533275e-06,
"loss": 0.0,
"num_input_tokens_seen": 1070000,
"step": 4995
},
{
"epoch": 15.873015873015873,
"grad_norm": 0.00011814333993243054,
"learning_rate": 6.218853780816933e-06,
"loss": 0.0,
"num_input_tokens_seen": 1071040,
"step": 5000
},
{
"epoch": 15.88888888888889,
"grad_norm": 0.00021936133271083236,
"learning_rate": 6.173213349909729e-06,
"loss": 0.0,
"num_input_tokens_seen": 1072144,
"step": 5005
},
{
"epoch": 15.904761904761905,
"grad_norm": 0.00013133411994203925,
"learning_rate": 6.127717412917977e-06,
"loss": 0.0,
"num_input_tokens_seen": 1073216,
"step": 5010
},
{
"epoch": 15.920634920634921,
"grad_norm": 0.00015381992852780968,
"learning_rate": 6.082366319018959e-06,
"loss": 0.0,
"num_input_tokens_seen": 1074272,
"step": 5015
},
{
"epoch": 15.936507936507937,
"grad_norm": 0.0039579374715685844,
"learning_rate": 6.037160416278278e-06,
"loss": 0.0,
"num_input_tokens_seen": 1075408,
"step": 5020
},
{
"epoch": 15.952380952380953,
"grad_norm": 6.8703229771927e-05,
"learning_rate": 5.9921000516472315e-06,
"loss": 0.0,
"num_input_tokens_seen": 1076496,
"step": 5025
},
{
"epoch": 15.968253968253968,
"grad_norm": 0.00011043099220842123,
"learning_rate": 5.947185570960123e-06,
"loss": 0.0,
"num_input_tokens_seen": 1077600,
"step": 5030
},
{
"epoch": 15.984126984126984,
"grad_norm": 0.00043672084575518966,
"learning_rate": 5.902417318931589e-06,
"loss": 0.0,
"num_input_tokens_seen": 1078736,
"step": 5035
},
{
"epoch": 16.0,
"grad_norm": 0.00018926948541775346,
"learning_rate": 5.857795639153998e-06,
"loss": 0.0,
"num_input_tokens_seen": 1079744,
"step": 5040
},
{
"epoch": 16.0,
"eval_loss": 0.20095133781433105,
"eval_runtime": 1.4576,
"eval_samples_per_second": 48.023,
"eval_steps_per_second": 24.011,
"num_input_tokens_seen": 1079744,
"step": 5040
},
{
"epoch": 16.015873015873016,
"grad_norm": 0.00011885230196639895,
"learning_rate": 5.813320874094771e-06,
"loss": 0.0,
"num_input_tokens_seen": 1080784,
"step": 5045
},
{
"epoch": 16.03174603174603,
"grad_norm": 0.0002344203821849078,
"learning_rate": 5.768993365093783e-06,
"loss": 0.0,
"num_input_tokens_seen": 1081808,
"step": 5050
},
{
"epoch": 16.047619047619047,
"grad_norm": 9.413900988874957e-05,
"learning_rate": 5.724813452360736e-06,
"loss": 0.0,
"num_input_tokens_seen": 1082864,
"step": 5055
},
{
"epoch": 16.063492063492063,
"grad_norm": 0.0001468745176680386,
"learning_rate": 5.6807814749725245e-06,
"loss": 0.0,
"num_input_tokens_seen": 1083888,
"step": 5060
},
{
"epoch": 16.07936507936508,
"grad_norm": 0.0007511080475524068,
"learning_rate": 5.636897770870666e-06,
"loss": 0.0,
"num_input_tokens_seen": 1085008,
"step": 5065
},
{
"epoch": 16.095238095238095,
"grad_norm": 0.0004616309597622603,
"learning_rate": 5.593162676858707e-06,
"loss": 0.0,
"num_input_tokens_seen": 1086112,
"step": 5070
},
{
"epoch": 16.11111111111111,
"grad_norm": 0.00021627935348078609,
"learning_rate": 5.54957652859959e-06,
"loss": 0.0,
"num_input_tokens_seen": 1087168,
"step": 5075
},
{
"epoch": 16.126984126984127,
"grad_norm": 0.00010006018419517204,
"learning_rate": 5.506139660613147e-06,
"loss": 0.0,
"num_input_tokens_seen": 1088272,
"step": 5080
},
{
"epoch": 16.142857142857142,
"grad_norm": 0.0001312010281253606,
"learning_rate": 5.462852406273464e-06,
"loss": 0.0,
"num_input_tokens_seen": 1089344,
"step": 5085
},
{
"epoch": 16.158730158730158,
"grad_norm": 0.00013077599578537047,
"learning_rate": 5.4197150978063965e-06,
"loss": 0.0,
"num_input_tokens_seen": 1090368,
"step": 5090
},
{
"epoch": 16.174603174603174,
"grad_norm": 0.00010001740884035826,
"learning_rate": 5.376728066286943e-06,
"loss": 0.0,
"num_input_tokens_seen": 1091440,
"step": 5095
},
{
"epoch": 16.19047619047619,
"grad_norm": 0.00011268608795944601,
"learning_rate": 5.333891641636748e-06,
"loss": 0.0,
"num_input_tokens_seen": 1092496,
"step": 5100
},
{
"epoch": 16.206349206349206,
"grad_norm": 0.0007534271571785212,
"learning_rate": 5.291206152621572e-06,
"loss": 0.0,
"num_input_tokens_seen": 1093520,
"step": 5105
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.004776482004672289,
"learning_rate": 5.248671926848753e-06,
"loss": 0.0,
"num_input_tokens_seen": 1094608,
"step": 5110
},
{
"epoch": 16.238095238095237,
"grad_norm": 0.00011033907503588125,
"learning_rate": 5.206289290764702e-06,
"loss": 0.0,
"num_input_tokens_seen": 1095664,
"step": 5115
},
{
"epoch": 16.253968253968253,
"grad_norm": 0.00014399575593415648,
"learning_rate": 5.164058569652377e-06,
"loss": 0.0,
"num_input_tokens_seen": 1096720,
"step": 5120
},
{
"epoch": 16.26984126984127,
"grad_norm": 0.0005759440246038139,
"learning_rate": 5.121980087628803e-06,
"loss": 0.0,
"num_input_tokens_seen": 1097744,
"step": 5125
},
{
"epoch": 16.285714285714285,
"grad_norm": 9.312896872870624e-05,
"learning_rate": 5.080054167642617e-06,
"loss": 0.0,
"num_input_tokens_seen": 1098816,
"step": 5130
},
{
"epoch": 16.3015873015873,
"grad_norm": 0.00014533007924910635,
"learning_rate": 5.038281131471514e-06,
"loss": 0.0,
"num_input_tokens_seen": 1099904,
"step": 5135
},
{
"epoch": 16.317460317460316,
"grad_norm": 0.00013747379125561565,
"learning_rate": 4.996661299719846e-06,
"loss": 0.0,
"num_input_tokens_seen": 1100944,
"step": 5140
},
{
"epoch": 16.333333333333332,
"grad_norm": 0.002072168281301856,
"learning_rate": 4.955194991816114e-06,
"loss": 0.0,
"num_input_tokens_seen": 1102000,
"step": 5145
},
{
"epoch": 16.349206349206348,
"grad_norm": 0.00013336709525901824,
"learning_rate": 4.913882526010555e-06,
"loss": 0.0,
"num_input_tokens_seen": 1103056,
"step": 5150
},
{
"epoch": 16.365079365079364,
"grad_norm": 0.00018689218268264085,
"learning_rate": 4.872724219372679e-06,
"loss": 0.0,
"num_input_tokens_seen": 1104192,
"step": 5155
},
{
"epoch": 16.38095238095238,
"grad_norm": 0.00011830328730866313,
"learning_rate": 4.831720387788827e-06,
"loss": 0.0,
"num_input_tokens_seen": 1105232,
"step": 5160
},
{
"epoch": 16.396825396825395,
"grad_norm": 0.00030842830892652273,
"learning_rate": 4.790871345959764e-06,
"loss": 0.0,
"num_input_tokens_seen": 1106224,
"step": 5165
},
{
"epoch": 16.41269841269841,
"grad_norm": 0.0011698472080752254,
"learning_rate": 4.750177407398268e-06,
"loss": 0.0,
"num_input_tokens_seen": 1107264,
"step": 5170
},
{
"epoch": 16.428571428571427,
"grad_norm": 0.0031321838032454252,
"learning_rate": 4.70963888442669e-06,
"loss": 0.0,
"num_input_tokens_seen": 1108352,
"step": 5175
},
{
"epoch": 16.444444444444443,
"grad_norm": 0.0015201118076220155,
"learning_rate": 4.669256088174606e-06,
"loss": 0.0,
"num_input_tokens_seen": 1109504,
"step": 5180
},
{
"epoch": 16.46031746031746,
"grad_norm": 0.000647184147965163,
"learning_rate": 4.629029328576381e-06,
"loss": 0.0,
"num_input_tokens_seen": 1110608,
"step": 5185
},
{
"epoch": 16.476190476190474,
"grad_norm": 0.0013111300067976117,
"learning_rate": 4.588958914368824e-06,
"loss": 0.0,
"num_input_tokens_seen": 1111696,
"step": 5190
},
{
"epoch": 16.49206349206349,
"grad_norm": 0.0003529720415826887,
"learning_rate": 4.549045153088813e-06,
"loss": 0.0,
"num_input_tokens_seen": 1112800,
"step": 5195
},
{
"epoch": 16.507936507936506,
"grad_norm": 0.00014995434321463108,
"learning_rate": 4.5092883510709085e-06,
"loss": 0.0,
"num_input_tokens_seen": 1113840,
"step": 5200
},
{
"epoch": 16.523809523809526,
"grad_norm": 6.491740350611508e-05,
"learning_rate": 4.469688813445042e-06,
"loss": 0.0,
"num_input_tokens_seen": 1114928,
"step": 5205
},
{
"epoch": 16.53968253968254,
"grad_norm": 0.0002835427294485271,
"learning_rate": 4.4302468441341536e-06,
"loss": 0.0,
"num_input_tokens_seen": 1115968,
"step": 5210
},
{
"epoch": 16.555555555555557,
"grad_norm": 0.00012973738193977624,
"learning_rate": 4.39096274585184e-06,
"loss": 0.0,
"num_input_tokens_seen": 1117040,
"step": 5215
},
{
"epoch": 16.571428571428573,
"grad_norm": 0.0009239883511327207,
"learning_rate": 4.3518368201000834e-06,
"loss": 0.0,
"num_input_tokens_seen": 1118032,
"step": 5220
},
{
"epoch": 16.58730158730159,
"grad_norm": 0.00019911407434847206,
"learning_rate": 4.312869367166875e-06,
"loss": 0.0,
"num_input_tokens_seen": 1119152,
"step": 5225
},
{
"epoch": 16.603174603174605,
"grad_norm": 0.0006706177373416722,
"learning_rate": 4.274060686123959e-06,
"loss": 0.0,
"num_input_tokens_seen": 1120192,
"step": 5230
},
{
"epoch": 16.61904761904762,
"grad_norm": 0.00014612732047680765,
"learning_rate": 4.235411074824524e-06,
"loss": 0.0,
"num_input_tokens_seen": 1121280,
"step": 5235
},
{
"epoch": 16.634920634920636,
"grad_norm": 0.0002357373887207359,
"learning_rate": 4.196920829900891e-06,
"loss": 0.0,
"num_input_tokens_seen": 1122352,
"step": 5240
},
{
"epoch": 16.650793650793652,
"grad_norm": 0.001285207225009799,
"learning_rate": 4.158590246762279e-06,
"loss": 0.0,
"num_input_tokens_seen": 1123376,
"step": 5245
},
{
"epoch": 16.666666666666668,
"grad_norm": 8.80981533555314e-05,
"learning_rate": 4.120419619592511e-06,
"loss": 0.0,
"num_input_tokens_seen": 1124400,
"step": 5250
},
{
"epoch": 16.682539682539684,
"grad_norm": 0.00017228191427420825,
"learning_rate": 4.082409241347754e-06,
"loss": 0.0,
"num_input_tokens_seen": 1125440,
"step": 5255
},
{
"epoch": 16.6984126984127,
"grad_norm": 0.0001255661336472258,
"learning_rate": 4.044559403754294e-06,
"loss": 0.0,
"num_input_tokens_seen": 1126464,
"step": 5260
},
{
"epoch": 16.714285714285715,
"grad_norm": 0.009000623598694801,
"learning_rate": 4.006870397306256e-06,
"loss": 0.0,
"num_input_tokens_seen": 1127552,
"step": 5265
},
{
"epoch": 16.73015873015873,
"grad_norm": 0.000503276998642832,
"learning_rate": 3.969342511263441e-06,
"loss": 0.0,
"num_input_tokens_seen": 1128640,
"step": 5270
},
{
"epoch": 16.746031746031747,
"grad_norm": 0.00038995477370917797,
"learning_rate": 3.931976033649021e-06,
"loss": 0.0,
"num_input_tokens_seen": 1129712,
"step": 5275
},
{
"epoch": 16.761904761904763,
"grad_norm": 0.00010485357779543847,
"learning_rate": 3.8947712512474085e-06,
"loss": 0.0,
"num_input_tokens_seen": 1130800,
"step": 5280
},
{
"epoch": 16.77777777777778,
"grad_norm": 7.115237531252205e-05,
"learning_rate": 3.857728449601991e-06,
"loss": 0.0,
"num_input_tokens_seen": 1131888,
"step": 5285
},
{
"epoch": 16.793650793650794,
"grad_norm": 0.0003514425188768655,
"learning_rate": 3.820847913012987e-06,
"loss": 0.0,
"num_input_tokens_seen": 1132960,
"step": 5290
},
{
"epoch": 16.80952380952381,
"grad_norm": 0.00010385180939920247,
"learning_rate": 3.784129924535243e-06,
"loss": 0.0,
"num_input_tokens_seen": 1133984,
"step": 5295
},
{
"epoch": 16.825396825396826,
"grad_norm": 0.00014585713506676257,
"learning_rate": 3.7475747659760502e-06,
"loss": 0.0,
"num_input_tokens_seen": 1135088,
"step": 5300
},
{
"epoch": 16.841269841269842,
"grad_norm": 9.117177978623658e-05,
"learning_rate": 3.7111827178930108e-06,
"loss": 0.0,
"num_input_tokens_seen": 1136176,
"step": 5305
},
{
"epoch": 16.857142857142858,
"grad_norm": 0.00011191629891982302,
"learning_rate": 3.6749540595918675e-06,
"loss": 0.0,
"num_input_tokens_seen": 1137280,
"step": 5310
},
{
"epoch": 16.873015873015873,
"grad_norm": 0.00021030911011621356,
"learning_rate": 3.6388890691243403e-06,
"loss": 0.0,
"num_input_tokens_seen": 1138320,
"step": 5315
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.004643842577934265,
"learning_rate": 3.6029880232860413e-06,
"loss": 0.0,
"num_input_tokens_seen": 1139360,
"step": 5320
},
{
"epoch": 16.904761904761905,
"grad_norm": 0.00018325127894058824,
"learning_rate": 3.5672511976142963e-06,
"loss": 0.0,
"num_input_tokens_seen": 1140448,
"step": 5325
},
{
"epoch": 16.92063492063492,
"grad_norm": 0.00018302863463759422,
"learning_rate": 3.531678866386076e-06,
"loss": 0.0,
"num_input_tokens_seen": 1141488,
"step": 5330
},
{
"epoch": 16.936507936507937,
"grad_norm": 0.0017416487680748105,
"learning_rate": 3.4962713026158694e-06,
"loss": 0.0,
"num_input_tokens_seen": 1142656,
"step": 5335
},
{
"epoch": 16.952380952380953,
"grad_norm": 0.0002722022181842476,
"learning_rate": 3.461028778053571e-06,
"loss": 0.0,
"num_input_tokens_seen": 1143728,
"step": 5340
},
{
"epoch": 16.96825396825397,
"grad_norm": 0.0025406088680028915,
"learning_rate": 3.4259515631824306e-06,
"loss": 0.0,
"num_input_tokens_seen": 1144880,
"step": 5345
},
{
"epoch": 16.984126984126984,
"grad_norm": 0.000209580481168814,
"learning_rate": 3.3910399272169657e-06,
"loss": 0.0,
"num_input_tokens_seen": 1146032,
"step": 5350
},
{
"epoch": 17.0,
"grad_norm": 0.0001166212823591195,
"learning_rate": 3.356294138100868e-06,
"loss": 0.0,
"num_input_tokens_seen": 1147088,
"step": 5355
},
{
"epoch": 17.0,
"eval_loss": 0.20359259843826294,
"eval_runtime": 1.4479,
"eval_samples_per_second": 48.345,
"eval_steps_per_second": 24.173,
"num_input_tokens_seen": 1147088,
"step": 5355
},
{
"epoch": 17.015873015873016,
"grad_norm": 0.00018003687728196383,
"learning_rate": 3.321714462504999e-06,
"loss": 0.0,
"num_input_tokens_seen": 1148144,
"step": 5360
},
{
"epoch": 17.03174603174603,
"grad_norm": 0.0001391788391629234,
"learning_rate": 3.2873011658252796e-06,
"loss": 0.0,
"num_input_tokens_seen": 1149248,
"step": 5365
},
{
"epoch": 17.047619047619047,
"grad_norm": 0.00013040899648331106,
"learning_rate": 3.2530545121807145e-06,
"loss": 0.0,
"num_input_tokens_seen": 1150384,
"step": 5370
},
{
"epoch": 17.063492063492063,
"grad_norm": 0.00015444679593201727,
"learning_rate": 3.2189747644113365e-06,
"loss": 0.0,
"num_input_tokens_seen": 1151424,
"step": 5375
},
{
"epoch": 17.07936507936508,
"grad_norm": 6.173488509375602e-05,
"learning_rate": 3.185062184076168e-06,
"loss": 0.0,
"num_input_tokens_seen": 1152512,
"step": 5380
},
{
"epoch": 17.095238095238095,
"grad_norm": 0.003296441398561001,
"learning_rate": 3.151317031451259e-06,
"loss": 0.0,
"num_input_tokens_seen": 1153568,
"step": 5385
},
{
"epoch": 17.11111111111111,
"grad_norm": 0.00010462482168804854,
"learning_rate": 3.1177395655276635e-06,
"loss": 0.0,
"num_input_tokens_seen": 1154672,
"step": 5390
},
{
"epoch": 17.126984126984127,
"grad_norm": 0.003144120331853628,
"learning_rate": 3.0843300440094397e-06,
"loss": 0.0,
"num_input_tokens_seen": 1155776,
"step": 5395
},
{
"epoch": 17.142857142857142,
"grad_norm": 0.0001563921687193215,
"learning_rate": 3.0510887233117096e-06,
"loss": 0.0,
"num_input_tokens_seen": 1156784,
"step": 5400
},
{
"epoch": 17.158730158730158,
"grad_norm": 0.00010624121932778507,
"learning_rate": 3.0180158585586397e-06,
"loss": 0.0,
"num_input_tokens_seen": 1157888,
"step": 5405
},
{
"epoch": 17.174603174603174,
"grad_norm": 8.494222856825218e-05,
"learning_rate": 2.98511170358155e-06,
"loss": 0.0,
"num_input_tokens_seen": 1158912,
"step": 5410
},
{
"epoch": 17.19047619047619,
"grad_norm": 0.00045355354086495936,
"learning_rate": 2.9523765109169017e-06,
"loss": 0.0,
"num_input_tokens_seen": 1159968,
"step": 5415
},
{
"epoch": 17.206349206349206,
"grad_norm": 9.295267227571458e-05,
"learning_rate": 2.9198105318043816e-06,
"loss": 0.0,
"num_input_tokens_seen": 1161008,
"step": 5420
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.0004320423468016088,
"learning_rate": 2.8874140161849917e-06,
"loss": 0.0,
"num_input_tokens_seen": 1162064,
"step": 5425
},
{
"epoch": 17.238095238095237,
"grad_norm": 0.00011025634739780799,
"learning_rate": 2.8551872126991147e-06,
"loss": 0.0,
"num_input_tokens_seen": 1163104,
"step": 5430
},
{
"epoch": 17.253968253968253,
"grad_norm": 0.00024053626111708581,
"learning_rate": 2.8231303686846124e-06,
"loss": 0.0,
"num_input_tokens_seen": 1164160,
"step": 5435
},
{
"epoch": 17.26984126984127,
"grad_norm": 0.0006347851594910026,
"learning_rate": 2.7912437301749026e-06,
"loss": 0.0,
"num_input_tokens_seen": 1165216,
"step": 5440
},
{
"epoch": 17.285714285714285,
"grad_norm": 0.004288922995328903,
"learning_rate": 2.759527541897103e-06,
"loss": 0.0,
"num_input_tokens_seen": 1166240,
"step": 5445
},
{
"epoch": 17.3015873015873,
"grad_norm": 0.00011301678750896826,
"learning_rate": 2.7279820472701554e-06,
"loss": 0.0,
"num_input_tokens_seen": 1167296,
"step": 5450
},
{
"epoch": 17.317460317460316,
"grad_norm": 0.0005049300380051136,
"learning_rate": 2.6966074884029164e-06,
"loss": 0.0,
"num_input_tokens_seen": 1168384,
"step": 5455
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.00013295926328282803,
"learning_rate": 2.665404106092348e-06,
"loss": 0.0,
"num_input_tokens_seen": 1169520,
"step": 5460
},
{
"epoch": 17.349206349206348,
"grad_norm": 0.006194146350026131,
"learning_rate": 2.634372139821631e-06,
"loss": 0.0,
"num_input_tokens_seen": 1170608,
"step": 5465
},
{
"epoch": 17.365079365079364,
"grad_norm": 0.00011388435086701065,
"learning_rate": 2.603511827758351e-06,
"loss": 0.0,
"num_input_tokens_seen": 1171696,
"step": 5470
},
{
"epoch": 17.38095238095238,
"grad_norm": 0.0018900822615250945,
"learning_rate": 2.57282340675267e-06,
"loss": 0.0,
"num_input_tokens_seen": 1172752,
"step": 5475
},
{
"epoch": 17.396825396825395,
"grad_norm": 0.0002572809753473848,
"learning_rate": 2.5423071123354845e-06,
"loss": 0.0,
"num_input_tokens_seen": 1173792,
"step": 5480
},
{
"epoch": 17.41269841269841,
"grad_norm": 0.0001357399160042405,
"learning_rate": 2.5119631787166474e-06,
"loss": 0.0,
"num_input_tokens_seen": 1174848,
"step": 5485
},
{
"epoch": 17.428571428571427,
"grad_norm": 0.007136253640055656,
"learning_rate": 2.4817918387831594e-06,
"loss": 0.0,
"num_input_tokens_seen": 1175920,
"step": 5490
},
{
"epoch": 17.444444444444443,
"grad_norm": 0.001479033729992807,
"learning_rate": 2.451793324097365e-06,
"loss": 0.0,
"num_input_tokens_seen": 1177024,
"step": 5495
},
{
"epoch": 17.46031746031746,
"grad_norm": 0.00012026441982015967,
"learning_rate": 2.421967864895211e-06,
"loss": 0.0,
"num_input_tokens_seen": 1178016,
"step": 5500
},
{
"epoch": 17.476190476190474,
"grad_norm": 8.03721122792922e-05,
"learning_rate": 2.3923156900844372e-06,
"loss": 0.0,
"num_input_tokens_seen": 1179136,
"step": 5505
},
{
"epoch": 17.49206349206349,
"grad_norm": 7.77265740907751e-05,
"learning_rate": 2.3628370272428564e-06,
"loss": 0.0,
"num_input_tokens_seen": 1180224,
"step": 5510
},
{
"epoch": 17.507936507936506,
"grad_norm": 0.002345818327739835,
"learning_rate": 2.3335321026165895e-06,
"loss": 0.0,
"num_input_tokens_seen": 1181312,
"step": 5515
},
{
"epoch": 17.523809523809526,
"grad_norm": 0.00011413685570005327,
"learning_rate": 2.304401141118326e-06,
"loss": 0.0,
"num_input_tokens_seen": 1182352,
"step": 5520
},
{
"epoch": 17.53968253968254,
"grad_norm": 0.00014744508371222764,
"learning_rate": 2.275444366325613e-06,
"loss": 0.0,
"num_input_tokens_seen": 1183504,
"step": 5525
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.0006693506147712469,
"learning_rate": 2.2466620004791244e-06,
"loss": 0.0,
"num_input_tokens_seen": 1184608,
"step": 5530
},
{
"epoch": 17.571428571428573,
"grad_norm": 0.0002359232894377783,
"learning_rate": 2.2180542644809564e-06,
"loss": 0.0,
"num_input_tokens_seen": 1185648,
"step": 5535
},
{
"epoch": 17.58730158730159,
"grad_norm": 0.00016359401342924684,
"learning_rate": 2.1896213778929533e-06,
"loss": 0.0,
"num_input_tokens_seen": 1186704,
"step": 5540
},
{
"epoch": 17.603174603174605,
"grad_norm": 0.0006329666939564049,
"learning_rate": 2.1613635589349756e-06,
"loss": 0.0,
"num_input_tokens_seen": 1187744,
"step": 5545
},
{
"epoch": 17.61904761904762,
"grad_norm": 0.004720310214906931,
"learning_rate": 2.133281024483297e-06,
"loss": 0.0,
"num_input_tokens_seen": 1188784,
"step": 5550
},
{
"epoch": 17.634920634920636,
"grad_norm": 0.001770343049429357,
"learning_rate": 2.105373990068862e-06,
"loss": 0.0,
"num_input_tokens_seen": 1189808,
"step": 5555
},
{
"epoch": 17.650793650793652,
"grad_norm": 0.0001479105558246374,
"learning_rate": 2.077642669875679e-06,
"loss": 0.0,
"num_input_tokens_seen": 1190880,
"step": 5560
},
{
"epoch": 17.666666666666668,
"grad_norm": 9.104243508772925e-05,
"learning_rate": 2.050087276739171e-06,
"loss": 0.0,
"num_input_tokens_seen": 1192032,
"step": 5565
},
{
"epoch": 17.682539682539684,
"grad_norm": 0.00013047012907918543,
"learning_rate": 2.0227080221445345e-06,
"loss": 0.0,
"num_input_tokens_seen": 1193136,
"step": 5570
},
{
"epoch": 17.6984126984127,
"grad_norm": 0.00011812873708549887,
"learning_rate": 1.9955051162251216e-06,
"loss": 0.0,
"num_input_tokens_seen": 1194208,
"step": 5575
},
{
"epoch": 17.714285714285715,
"grad_norm": 0.003984814044088125,
"learning_rate": 1.968478767760812e-06,
"loss": 0.0,
"num_input_tokens_seen": 1195216,
"step": 5580
},
{
"epoch": 17.73015873015873,
"grad_norm": 0.0007169300224632025,
"learning_rate": 1.941629184176422e-06,
"loss": 0.0,
"num_input_tokens_seen": 1196272,
"step": 5585
},
{
"epoch": 17.746031746031747,
"grad_norm": 0.0008787508704699576,
"learning_rate": 1.9149565715401415e-06,
"loss": 0.0,
"num_input_tokens_seen": 1197328,
"step": 5590
},
{
"epoch": 17.761904761904763,
"grad_norm": 0.00010687024041544646,
"learning_rate": 1.8884611345618863e-06,
"loss": 0.0,
"num_input_tokens_seen": 1198368,
"step": 5595
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.00012383893772494048,
"learning_rate": 1.8621430765917964e-06,
"loss": 0.0,
"num_input_tokens_seen": 1199488,
"step": 5600
},
{
"epoch": 17.793650793650794,
"grad_norm": 0.00018226118118036538,
"learning_rate": 1.8360025996186137e-06,
"loss": 0.0,
"num_input_tokens_seen": 1200592,
"step": 5605
},
{
"epoch": 17.80952380952381,
"grad_norm": 9.4871676992625e-05,
"learning_rate": 1.8100399042681848e-06,
"loss": 0.0,
"num_input_tokens_seen": 1201584,
"step": 5610
},
{
"epoch": 17.825396825396826,
"grad_norm": 0.00025049285613931715,
"learning_rate": 1.784255189801895e-06,
"loss": 0.0,
"num_input_tokens_seen": 1202656,
"step": 5615
},
{
"epoch": 17.841269841269842,
"grad_norm": 0.0001846710656536743,
"learning_rate": 1.7586486541151303e-06,
"loss": 0.0,
"num_input_tokens_seen": 1203744,
"step": 5620
},
{
"epoch": 17.857142857142858,
"grad_norm": 0.0005062821437604725,
"learning_rate": 1.7332204937357793e-06,
"loss": 0.0,
"num_input_tokens_seen": 1204864,
"step": 5625
},
{
"epoch": 17.873015873015873,
"grad_norm": 8.464482380077243e-05,
"learning_rate": 1.7079709038227227e-06,
"loss": 0.0,
"num_input_tokens_seen": 1205920,
"step": 5630
},
{
"epoch": 17.88888888888889,
"grad_norm": 8.669216913403943e-05,
"learning_rate": 1.6829000781643094e-06,
"loss": 0.0,
"num_input_tokens_seen": 1206992,
"step": 5635
},
{
"epoch": 17.904761904761905,
"grad_norm": 0.00011535276280483231,
"learning_rate": 1.6580082091769088e-06,
"loss": 0.0,
"num_input_tokens_seen": 1208048,
"step": 5640
},
{
"epoch": 17.92063492063492,
"grad_norm": 0.0006412939983420074,
"learning_rate": 1.633295487903394e-06,
"loss": 0.0,
"num_input_tokens_seen": 1209104,
"step": 5645
},
{
"epoch": 17.936507936507937,
"grad_norm": 0.00021313075558282435,
"learning_rate": 1.6087621040117157e-06,
"loss": 0.0,
"num_input_tokens_seen": 1210160,
"step": 5650
},
{
"epoch": 17.952380952380953,
"grad_norm": 0.0002721626660786569,
"learning_rate": 1.5844082457934145e-06,
"loss": 0.0,
"num_input_tokens_seen": 1211264,
"step": 5655
},
{
"epoch": 17.96825396825397,
"grad_norm": 9.288136789109558e-05,
"learning_rate": 1.5602341001621834e-06,
"loss": 0.0,
"num_input_tokens_seen": 1212288,
"step": 5660
},
{
"epoch": 17.984126984126984,
"grad_norm": 0.00033692075521685183,
"learning_rate": 1.5362398526524463e-06,
"loss": 0.0,
"num_input_tokens_seen": 1213360,
"step": 5665
},
{
"epoch": 18.0,
"grad_norm": 0.00020394432067405432,
"learning_rate": 1.5124256874179288e-06,
"loss": 0.0,
"num_input_tokens_seen": 1214432,
"step": 5670
},
{
"epoch": 18.0,
"eval_loss": 0.20379270613193512,
"eval_runtime": 1.4505,
"eval_samples_per_second": 48.26,
"eval_steps_per_second": 24.13,
"num_input_tokens_seen": 1214432,
"step": 5670
},
{
"epoch": 18.015873015873016,
"grad_norm": 0.00013542307715397328,
"learning_rate": 1.4887917872302231e-06,
"loss": 0.0,
"num_input_tokens_seen": 1215504,
"step": 5675
},
{
"epoch": 18.03174603174603,
"grad_norm": 0.000211275095352903,
"learning_rate": 1.465338333477423e-06,
"loss": 0.0,
"num_input_tokens_seen": 1216560,
"step": 5680
},
{
"epoch": 18.047619047619047,
"grad_norm": 0.0007974806358106434,
"learning_rate": 1.4420655061626932e-06,
"loss": 0.0,
"num_input_tokens_seen": 1217584,
"step": 5685
},
{
"epoch": 18.063492063492063,
"grad_norm": 0.0027897171676158905,
"learning_rate": 1.4189734839029273e-06,
"loss": 0.0,
"num_input_tokens_seen": 1218720,
"step": 5690
},
{
"epoch": 18.07936507936508,
"grad_norm": 7.086082769092172e-05,
"learning_rate": 1.3960624439273428e-06,
"loss": 0.0,
"num_input_tokens_seen": 1219744,
"step": 5695
},
{
"epoch": 18.095238095238095,
"grad_norm": 6.145464431028813e-05,
"learning_rate": 1.3733325620761294e-06,
"loss": 0.0,
"num_input_tokens_seen": 1220912,
"step": 5700
},
{
"epoch": 18.11111111111111,
"grad_norm": 0.0001662890863372013,
"learning_rate": 1.3507840127991138e-06,
"loss": 0.0,
"num_input_tokens_seen": 1221904,
"step": 5705
},
{
"epoch": 18.126984126984127,
"grad_norm": 9.012035297928378e-05,
"learning_rate": 1.328416969154414e-06,
"loss": 0.0,
"num_input_tokens_seen": 1222944,
"step": 5710
},
{
"epoch": 18.142857142857142,
"grad_norm": 0.0004921794170513749,
"learning_rate": 1.3062316028071065e-06,
"loss": 0.0,
"num_input_tokens_seen": 1224016,
"step": 5715
},
{
"epoch": 18.158730158730158,
"grad_norm": 0.00010398301674285904,
"learning_rate": 1.2842280840278997e-06,
"loss": 0.0,
"num_input_tokens_seen": 1225120,
"step": 5720
},
{
"epoch": 18.174603174603174,
"grad_norm": 7.405834185192361e-05,
"learning_rate": 1.2624065816918413e-06,
"loss": 0.0,
"num_input_tokens_seen": 1226160,
"step": 5725
},
{
"epoch": 18.19047619047619,
"grad_norm": 0.0009600871126167476,
"learning_rate": 1.2407672632770374e-06,
"loss": 0.0,
"num_input_tokens_seen": 1227296,
"step": 5730
},
{
"epoch": 18.206349206349206,
"grad_norm": 0.00020508236775640398,
"learning_rate": 1.219310294863324e-06,
"loss": 0.0,
"num_input_tokens_seen": 1228352,
"step": 5735
},
{
"epoch": 18.22222222222222,
"grad_norm": 7.430932600982487e-05,
"learning_rate": 1.1980358411310344e-06,
"loss": 0.0,
"num_input_tokens_seen": 1229344,
"step": 5740
},
{
"epoch": 18.238095238095237,
"grad_norm": 0.00010048592957900837,
"learning_rate": 1.1769440653597141e-06,
"loss": 0.0,
"num_input_tokens_seen": 1230416,
"step": 5745
},
{
"epoch": 18.253968253968253,
"grad_norm": 9.934873378369957e-05,
"learning_rate": 1.1560351294268579e-06,
"loss": 0.0,
"num_input_tokens_seen": 1231488,
"step": 5750
},
{
"epoch": 18.26984126984127,
"grad_norm": 0.0001546377025078982,
"learning_rate": 1.1353091938067023e-06,
"loss": 0.0,
"num_input_tokens_seen": 1232512,
"step": 5755
},
{
"epoch": 18.285714285714285,
"grad_norm": 6.141650374047458e-05,
"learning_rate": 1.1147664175689577e-06,
"loss": 0.0,
"num_input_tokens_seen": 1233568,
"step": 5760
},
{
"epoch": 18.3015873015873,
"grad_norm": 0.00037183158565312624,
"learning_rate": 1.0944069583776057e-06,
"loss": 0.0,
"num_input_tokens_seen": 1234656,
"step": 5765
},
{
"epoch": 18.317460317460316,
"grad_norm": 9.08130532479845e-05,
"learning_rate": 1.0742309724896925e-06,
"loss": 0.0,
"num_input_tokens_seen": 1235760,
"step": 5770
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.0001254588714800775,
"learning_rate": 1.0542386147541133e-06,
"loss": 0.0,
"num_input_tokens_seen": 1236816,
"step": 5775
},
{
"epoch": 18.349206349206348,
"grad_norm": 0.0001581174583407119,
"learning_rate": 1.03443003861044e-06,
"loss": 0.0,
"num_input_tokens_seen": 1237920,
"step": 5780
},
{
"epoch": 18.365079365079364,
"grad_norm": 0.0005327375256456435,
"learning_rate": 1.0148053960877396e-06,
"loss": 0.0,
"num_input_tokens_seen": 1238960,
"step": 5785
},
{
"epoch": 18.38095238095238,
"grad_norm": 0.00010738349374150857,
"learning_rate": 9.95364837803392e-07,
"loss": 0.0,
"num_input_tokens_seen": 1239984,
"step": 5790
},
{
"epoch": 18.396825396825395,
"grad_norm": 0.00017572024080436677,
"learning_rate": 9.761085129619597e-07,
"loss": 0.0,
"num_input_tokens_seen": 1241104,
"step": 5795
},
{
"epoch": 18.41269841269841,
"grad_norm": 0.00010885854135267437,
"learning_rate": 9.570365693540251e-07,
"loss": 0.0,
"num_input_tokens_seen": 1242192,
"step": 5800
},
{
"epoch": 18.428571428571427,
"grad_norm": 0.005107260309159756,
"learning_rate": 9.381491533550612e-07,
"loss": 0.0,
"num_input_tokens_seen": 1243248,
"step": 5805
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.0001170175164588727,
"learning_rate": 9.194464099243128e-07,
"loss": 0.0,
"num_input_tokens_seen": 1244304,
"step": 5810
},
{
"epoch": 18.46031746031746,
"grad_norm": 0.00015847616305109113,
"learning_rate": 9.009284826036691e-07,
"loss": 0.0,
"num_input_tokens_seen": 1245344,
"step": 5815
},
{
"epoch": 18.476190476190474,
"grad_norm": 8.687510126037523e-05,
"learning_rate": 8.825955135165764e-07,
"loss": 0.0,
"num_input_tokens_seen": 1246400,
"step": 5820
},
{
"epoch": 18.49206349206349,
"grad_norm": 0.0011609104694798589,
"learning_rate": 8.64447643366953e-07,
"loss": 0.0,
"num_input_tokens_seen": 1247488,
"step": 5825
},
{
"epoch": 18.507936507936506,
"grad_norm": 0.004330518189817667,
"learning_rate": 8.464850114380807e-07,
"loss": 0.0,
"num_input_tokens_seen": 1248576,
"step": 5830
},
{
"epoch": 18.523809523809526,
"grad_norm": 0.00015605830412823707,
"learning_rate": 8.287077555915706e-07,
"loss": 0.0,
"num_input_tokens_seen": 1249648,
"step": 5835
},
{
"epoch": 18.53968253968254,
"grad_norm": 0.0001275492977583781,
"learning_rate": 8.111160122662748e-07,
"loss": 0.0,
"num_input_tokens_seen": 1250704,
"step": 5840
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.00016683620924595743,
"learning_rate": 7.937099164772699e-07,
"loss": 0.0,
"num_input_tokens_seen": 1251728,
"step": 5845
},
{
"epoch": 18.571428571428573,
"grad_norm": 0.0003593855944927782,
"learning_rate": 7.764896018147921e-07,
"loss": 0.0,
"num_input_tokens_seen": 1252816,
"step": 5850
},
{
"epoch": 18.58730158730159,
"grad_norm": 0.0005089179612696171,
"learning_rate": 7.594552004432265e-07,
"loss": 0.0,
"num_input_tokens_seen": 1253920,
"step": 5855
},
{
"epoch": 18.603174603174605,
"grad_norm": 0.006525584030896425,
"learning_rate": 7.426068431000882e-07,
"loss": 0.0,
"num_input_tokens_seen": 1254976,
"step": 5860
},
{
"epoch": 18.61904761904762,
"grad_norm": 0.00029004551470279694,
"learning_rate": 7.259446590950264e-07,
"loss": 0.0,
"num_input_tokens_seen": 1256128,
"step": 5865
},
{
"epoch": 18.634920634920636,
"grad_norm": 0.0001299941068282351,
"learning_rate": 7.094687763088248e-07,
"loss": 0.0,
"num_input_tokens_seen": 1257200,
"step": 5870
},
{
"epoch": 18.650793650793652,
"grad_norm": 0.00023286275973077863,
"learning_rate": 6.931793211924192e-07,
"loss": 0.0,
"num_input_tokens_seen": 1258256,
"step": 5875
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.00012554308341350406,
"learning_rate": 6.770764187659262e-07,
"loss": 0.0,
"num_input_tokens_seen": 1259312,
"step": 5880
},
{
"epoch": 18.682539682539684,
"grad_norm": 0.002065224340185523,
"learning_rate": 6.611601926177019e-07,
"loss": 0.0,
"num_input_tokens_seen": 1260352,
"step": 5885
},
{
"epoch": 18.6984126984127,
"grad_norm": 6.723072146996856e-05,
"learning_rate": 6.454307649033569e-07,
"loss": 0.0,
"num_input_tokens_seen": 1261456,
"step": 5890
},
{
"epoch": 18.714285714285715,
"grad_norm": 0.00010569453297648579,
"learning_rate": 6.298882563448599e-07,
"loss": 0.0,
"num_input_tokens_seen": 1262560,
"step": 5895
},
{
"epoch": 18.73015873015873,
"grad_norm": 0.0001583786215633154,
"learning_rate": 6.145327862295824e-07,
"loss": 0.0,
"num_input_tokens_seen": 1263648,
"step": 5900
},
{
"epoch": 18.746031746031747,
"grad_norm": 0.00013246589514892548,
"learning_rate": 5.993644724093888e-07,
"loss": 0.0,
"num_input_tokens_seen": 1264768,
"step": 5905
},
{
"epoch": 18.761904761904763,
"grad_norm": 8.11208738014102e-05,
"learning_rate": 5.843834312997481e-07,
"loss": 0.0,
"num_input_tokens_seen": 1265888,
"step": 5910
},
{
"epoch": 18.77777777777778,
"grad_norm": 8.87354981387034e-05,
"learning_rate": 5.695897778788151e-07,
"loss": 0.0,
"num_input_tokens_seen": 1266976,
"step": 5915
},
{
"epoch": 18.793650793650794,
"grad_norm": 0.00012595752195920795,
"learning_rate": 5.549836256865642e-07,
"loss": 0.0,
"num_input_tokens_seen": 1268064,
"step": 5920
},
{
"epoch": 18.80952380952381,
"grad_norm": 0.0027428085450083017,
"learning_rate": 5.405650868239242e-07,
"loss": 0.0,
"num_input_tokens_seen": 1269104,
"step": 5925
},
{
"epoch": 18.825396825396826,
"grad_norm": 0.008050281554460526,
"learning_rate": 5.263342719518921e-07,
"loss": 0.0,
"num_input_tokens_seen": 1270192,
"step": 5930
},
{
"epoch": 18.841269841269842,
"grad_norm": 0.0020299924071878195,
"learning_rate": 5.122912902907145e-07,
"loss": 0.0,
"num_input_tokens_seen": 1271264,
"step": 5935
},
{
"epoch": 18.857142857142858,
"grad_norm": 0.00032130113686434925,
"learning_rate": 4.98436249619022e-07,
"loss": 0.0,
"num_input_tokens_seen": 1272368,
"step": 5940
},
{
"epoch": 18.873015873015873,
"grad_norm": 0.002536200685426593,
"learning_rate": 4.847692562730238e-07,
"loss": 0.0,
"num_input_tokens_seen": 1273440,
"step": 5945
},
{
"epoch": 18.88888888888889,
"grad_norm": 0.000218776855035685,
"learning_rate": 4.712904151456865e-07,
"loss": 0.0,
"num_input_tokens_seen": 1274528,
"step": 5950
},
{
"epoch": 18.904761904761905,
"grad_norm": 0.00021643297804985195,
"learning_rate": 4.579998296859067e-07,
"loss": 0.0,
"num_input_tokens_seen": 1275584,
"step": 5955
},
{
"epoch": 18.92063492063492,
"grad_norm": 0.00038543707341887057,
"learning_rate": 4.448976018977563e-07,
"loss": 0.0,
"num_input_tokens_seen": 1276672,
"step": 5960
},
{
"epoch": 18.936507936507937,
"grad_norm": 0.0002801400551106781,
"learning_rate": 4.319838323396691e-07,
"loss": 0.0,
"num_input_tokens_seen": 1277792,
"step": 5965
},
{
"epoch": 18.952380952380953,
"grad_norm": 0.00012454042735043913,
"learning_rate": 4.192586201236748e-07,
"loss": 0.0,
"num_input_tokens_seen": 1278832,
"step": 5970
},
{
"epoch": 18.96825396825397,
"grad_norm": 0.0001259175915038213,
"learning_rate": 4.067220629146523e-07,
"loss": 0.0,
"num_input_tokens_seen": 1279888,
"step": 5975
},
{
"epoch": 18.984126984126984,
"grad_norm": 0.00011188061034772545,
"learning_rate": 3.943742569295583e-07,
"loss": 0.0,
"num_input_tokens_seen": 1281008,
"step": 5980
},
{
"epoch": 19.0,
"grad_norm": 0.0001576906506670639,
"learning_rate": 3.8221529693671375e-07,
"loss": 0.0,
"num_input_tokens_seen": 1282000,
"step": 5985
},
{
"epoch": 19.0,
"eval_loss": 0.2037855088710785,
"eval_runtime": 1.4442,
"eval_samples_per_second": 48.469,
"eval_steps_per_second": 24.234,
"num_input_tokens_seen": 1282000,
"step": 5985
},
{
"epoch": 19.015873015873016,
"grad_norm": 0.00013823146582581103,
"learning_rate": 3.702452762550546e-07,
"loss": 0.0,
"num_input_tokens_seen": 1283104,
"step": 5990
},
{
"epoch": 19.03174603174603,
"grad_norm": 9.556670556776226e-05,
"learning_rate": 3.5846428675342657e-07,
"loss": 0.0,
"num_input_tokens_seen": 1284096,
"step": 5995
},
{
"epoch": 19.047619047619047,
"grad_norm": 0.0001208999747177586,
"learning_rate": 3.468724188498751e-07,
"loss": 0.0,
"num_input_tokens_seen": 1285152,
"step": 6000
},
{
"epoch": 19.063492063492063,
"grad_norm": 0.0001655189407756552,
"learning_rate": 3.3546976151095924e-07,
"loss": 0.0,
"num_input_tokens_seen": 1286240,
"step": 6005
},
{
"epoch": 19.07936507936508,
"grad_norm": 0.0003586008388083428,
"learning_rate": 3.242564022510608e-07,
"loss": 0.0,
"num_input_tokens_seen": 1287328,
"step": 6010
},
{
"epoch": 19.095238095238095,
"grad_norm": 0.00048587197670713067,
"learning_rate": 3.132324271317183e-07,
"loss": 0.0,
"num_input_tokens_seen": 1288448,
"step": 6015
},
{
"epoch": 19.11111111111111,
"grad_norm": 0.00016730540664866567,
"learning_rate": 3.0239792076095506e-07,
"loss": 0.0,
"num_input_tokens_seen": 1289456,
"step": 6020
},
{
"epoch": 19.126984126984127,
"grad_norm": 0.0003036040288861841,
"learning_rate": 2.9175296629265493e-07,
"loss": 0.0,
"num_input_tokens_seen": 1290512,
"step": 6025
},
{
"epoch": 19.142857142857142,
"grad_norm": 0.0002358776400797069,
"learning_rate": 2.8129764542589033e-07,
"loss": 0.0,
"num_input_tokens_seen": 1291584,
"step": 6030
},
{
"epoch": 19.158730158730158,
"grad_norm": 0.007324682082980871,
"learning_rate": 2.71032038404323e-07,
"loss": 0.0,
"num_input_tokens_seen": 1292624,
"step": 6035
},
{
"epoch": 19.174603174603174,
"grad_norm": 0.00022556190378963947,
"learning_rate": 2.609562240155766e-07,
"loss": 0.0,
"num_input_tokens_seen": 1293680,
"step": 6040
},
{
"epoch": 19.19047619047619,
"grad_norm": 0.0002528807381168008,
"learning_rate": 2.510702795906289e-07,
"loss": 0.0,
"num_input_tokens_seen": 1294800,
"step": 6045
},
{
"epoch": 19.206349206349206,
"grad_norm": 0.00019078988407272846,
"learning_rate": 2.413742810032288e-07,
"loss": 0.0,
"num_input_tokens_seen": 1295888,
"step": 6050
},
{
"epoch": 19.22222222222222,
"grad_norm": 0.00042369638686068356,
"learning_rate": 2.318683026692997e-07,
"loss": 0.0,
"num_input_tokens_seen": 1296976,
"step": 6055
},
{
"epoch": 19.238095238095237,
"grad_norm": 0.00010991137969540432,
"learning_rate": 2.2255241754638167e-07,
"loss": 0.0,
"num_input_tokens_seen": 1298048,
"step": 6060
},
{
"epoch": 19.253968253968253,
"grad_norm": 0.00027539842994883657,
"learning_rate": 2.1342669713307063e-07,
"loss": 0.0,
"num_input_tokens_seen": 1299088,
"step": 6065
},
{
"epoch": 19.26984126984127,
"grad_norm": 9.853248775471002e-05,
"learning_rate": 2.0449121146845774e-07,
"loss": 0.0,
"num_input_tokens_seen": 1300144,
"step": 6070
},
{
"epoch": 19.285714285714285,
"grad_norm": 0.00015365192666649818,
"learning_rate": 1.9574602913159934e-07,
"loss": 0.0,
"num_input_tokens_seen": 1301296,
"step": 6075
},
{
"epoch": 19.3015873015873,
"grad_norm": 0.00011225073831155896,
"learning_rate": 1.8719121724099508e-07,
"loss": 0.0,
"num_input_tokens_seen": 1302320,
"step": 6080
},
{
"epoch": 19.317460317460316,
"grad_norm": 0.000277652230579406,
"learning_rate": 1.7882684145406614e-07,
"loss": 0.0,
"num_input_tokens_seen": 1303408,
"step": 6085
},
{
"epoch": 19.333333333333332,
"grad_norm": 9.394479275215417e-05,
"learning_rate": 1.706529659666556e-07,
"loss": 0.0,
"num_input_tokens_seen": 1304448,
"step": 6090
},
{
"epoch": 19.349206349206348,
"grad_norm": 0.004892684053629637,
"learning_rate": 1.6266965351252884e-07,
"loss": 0.0,
"num_input_tokens_seen": 1305552,
"step": 6095
},
{
"epoch": 19.365079365079364,
"grad_norm": 0.00015211277059279382,
"learning_rate": 1.5487696536290176e-07,
"loss": 0.0,
"num_input_tokens_seen": 1306592,
"step": 6100
},
{
"epoch": 19.38095238095238,
"grad_norm": 0.00013414997374638915,
"learning_rate": 1.472749613259661e-07,
"loss": 0.0,
"num_input_tokens_seen": 1307632,
"step": 6105
},
{
"epoch": 19.396825396825395,
"grad_norm": 0.00033985593472607434,
"learning_rate": 1.398636997464231e-07,
"loss": 0.0,
"num_input_tokens_seen": 1308720,
"step": 6110
},
{
"epoch": 19.41269841269841,
"grad_norm": 8.913094643503428e-05,
"learning_rate": 1.326432375050479e-07,
"loss": 0.0,
"num_input_tokens_seen": 1309824,
"step": 6115
},
{
"epoch": 19.428571428571427,
"grad_norm": 0.00010829928942257538,
"learning_rate": 1.2561363001824812e-07,
"loss": 0.0,
"num_input_tokens_seen": 1310832,
"step": 6120
},
{
"epoch": 19.444444444444443,
"grad_norm": 0.00025828013895079494,
"learning_rate": 1.1877493123763905e-07,
"loss": 0.0,
"num_input_tokens_seen": 1311936,
"step": 6125
},
{
"epoch": 19.46031746031746,
"grad_norm": 0.0003705843409989029,
"learning_rate": 1.1212719364962209e-07,
"loss": 0.0,
"num_input_tokens_seen": 1312992,
"step": 6130
},
{
"epoch": 19.476190476190474,
"grad_norm": 0.00011419185466365889,
"learning_rate": 1.0567046827499594e-07,
"loss": 0.0,
"num_input_tokens_seen": 1314080,
"step": 6135
},
{
"epoch": 19.49206349206349,
"grad_norm": 8.912238990888e-05,
"learning_rate": 9.940480466855417e-08,
"loss": 0.0,
"num_input_tokens_seen": 1315200,
"step": 6140
},
{
"epoch": 19.507936507936506,
"grad_norm": 0.00011151758371852338,
"learning_rate": 9.333025091870506e-08,
"loss": 0.0,
"num_input_tokens_seen": 1316256,
"step": 6145
},
{
"epoch": 19.523809523809526,
"grad_norm": 0.00011681997420964763,
"learning_rate": 8.744685364711624e-08,
"loss": 0.0,
"num_input_tokens_seen": 1317328,
"step": 6150
},
{
"epoch": 19.53968253968254,
"grad_norm": 0.000633031188044697,
"learning_rate": 8.17546580083317e-08,
"loss": 0.0,
"num_input_tokens_seen": 1318368,
"step": 6155
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.0011906184954568744,
"learning_rate": 7.625370768944984e-08,
"loss": 0.0,
"num_input_tokens_seen": 1319440,
"step": 6160
},
{
"epoch": 19.571428571428573,
"grad_norm": 9.492343087913468e-05,
"learning_rate": 7.094404490977923e-08,
"loss": 0.0,
"num_input_tokens_seen": 1320464,
"step": 6165
},
{
"epoch": 19.58730158730159,
"grad_norm": 8.97967693163082e-05,
"learning_rate": 6.582571042050567e-08,
"loss": 0.0,
"num_input_tokens_seen": 1321552,
"step": 6170
},
{
"epoch": 19.603174603174605,
"grad_norm": 0.00012746088032145053,
"learning_rate": 6.089874350439506e-08,
"loss": 0.0,
"num_input_tokens_seen": 1322576,
"step": 6175
},
{
"epoch": 19.61904761904762,
"grad_norm": 0.001789805362932384,
"learning_rate": 5.6163181975477096e-08,
"loss": 0.0,
"num_input_tokens_seen": 1323632,
"step": 6180
},
{
"epoch": 19.634920634920636,
"grad_norm": 0.00017200844013132155,
"learning_rate": 5.161906217877044e-08,
"loss": 0.0,
"num_input_tokens_seen": 1324704,
"step": 6185
},
{
"epoch": 19.650793650793652,
"grad_norm": 0.0007840655161999166,
"learning_rate": 4.726641898998574e-08,
"loss": 0.0,
"num_input_tokens_seen": 1325840,
"step": 6190
},
{
"epoch": 19.666666666666668,
"grad_norm": 0.005137111991643906,
"learning_rate": 4.310528581527862e-08,
"loss": 0.0,
"num_input_tokens_seen": 1326896,
"step": 6195
},
{
"epoch": 19.682539682539684,
"grad_norm": 0.000191491621080786,
"learning_rate": 3.9135694590972104e-08,
"loss": 0.0,
"num_input_tokens_seen": 1328000,
"step": 6200
},
{
"epoch": 19.6984126984127,
"grad_norm": 7.77663808548823e-05,
"learning_rate": 3.5357675783331825e-08,
"loss": 0.0,
"num_input_tokens_seen": 1329056,
"step": 6205
},
{
"epoch": 19.714285714285715,
"grad_norm": 0.00060555204981938,
"learning_rate": 3.177125838830786e-08,
"loss": 0.0,
"num_input_tokens_seen": 1330144,
"step": 6210
},
{
"epoch": 19.73015873015873,
"grad_norm": 0.00011826303671114147,
"learning_rate": 2.837646993134324e-08,
"loss": 0.0,
"num_input_tokens_seen": 1331168,
"step": 6215
},
{
"epoch": 19.746031746031747,
"grad_norm": 0.00021305291738826782,
"learning_rate": 2.5173336467135267e-08,
"loss": 0.0,
"num_input_tokens_seen": 1332288,
"step": 6220
},
{
"epoch": 19.761904761904763,
"grad_norm": 0.00013271687203086913,
"learning_rate": 2.2161882579446735e-08,
"loss": 0.0,
"num_input_tokens_seen": 1333408,
"step": 6225
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.0009385115699842572,
"learning_rate": 1.9342131380920005e-08,
"loss": 0.0,
"num_input_tokens_seen": 1334464,
"step": 6230
},
{
"epoch": 19.793650793650794,
"grad_norm": 0.00010659523832146078,
"learning_rate": 1.6714104512896568e-08,
"loss": 0.0,
"num_input_tokens_seen": 1335520,
"step": 6235
},
{
"epoch": 19.80952380952381,
"grad_norm": 0.0001576508511789143,
"learning_rate": 1.427782214524498e-08,
"loss": 0.0,
"num_input_tokens_seen": 1336624,
"step": 6240
},
{
"epoch": 19.825396825396826,
"grad_norm": 6.760272663086653e-05,
"learning_rate": 1.2033302976222071e-08,
"loss": 0.0,
"num_input_tokens_seen": 1337776,
"step": 6245
},
{
"epoch": 19.841269841269842,
"grad_norm": 0.000145541358506307,
"learning_rate": 9.980564232311973e-09,
"loss": 0.0,
"num_input_tokens_seen": 1338848,
"step": 6250
},
{
"epoch": 19.857142857142858,
"grad_norm": 0.0001003668294288218,
"learning_rate": 8.11962166809843e-09,
"loss": 0.0,
"num_input_tokens_seen": 1339920,
"step": 6255
},
{
"epoch": 19.873015873015873,
"grad_norm": 0.0012003045994788408,
"learning_rate": 6.450489566151019e-09,
"loss": 0.0,
"num_input_tokens_seen": 1341072,
"step": 6260
},
{
"epoch": 19.88888888888889,
"grad_norm": 0.00013508858683053404,
"learning_rate": 4.9731807369113316e-09,
"loss": 0.0,
"num_input_tokens_seen": 1342096,
"step": 6265
},
{
"epoch": 19.904761904761905,
"grad_norm": 0.00233248807489872,
"learning_rate": 3.687706518595846e-09,
"loss": 0.0,
"num_input_tokens_seen": 1343168,
"step": 6270
},
{
"epoch": 19.92063492063492,
"grad_norm": 0.00012182630598545074,
"learning_rate": 2.594076777104326e-09,
"loss": 0.0,
"num_input_tokens_seen": 1344208,
"step": 6275
},
{
"epoch": 19.936507936507937,
"grad_norm": 0.0001995829225052148,
"learning_rate": 1.692299905944883e-09,
"loss": 0.0,
"num_input_tokens_seen": 1345216,
"step": 6280
},
{
"epoch": 19.952380952380953,
"grad_norm": 7.580827514175326e-05,
"learning_rate": 9.823828261756873e-10,
"loss": 0.0,
"num_input_tokens_seen": 1346240,
"step": 6285
},
{
"epoch": 19.96825396825397,
"grad_norm": 0.00014859516522847116,
"learning_rate": 4.643309863494594e-10,
"loss": 0.0,
"num_input_tokens_seen": 1347312,
"step": 6290
},
{
"epoch": 19.984126984126984,
"grad_norm": 9.452983067603782e-05,
"learning_rate": 1.381483624662838e-10,
"loss": 0.0,
"num_input_tokens_seen": 1348352,
"step": 6295
},
{
"epoch": 20.0,
"grad_norm": 6.697617936879396e-05,
"learning_rate": 3.837457948629997e-12,
"loss": 0.0,
"num_input_tokens_seen": 1349424,
"step": 6300
},
{
"epoch": 20.0,
"eval_loss": 0.20304246246814728,
"eval_runtime": 1.458,
"eval_samples_per_second": 48.011,
"eval_steps_per_second": 24.005,
"num_input_tokens_seen": 1349424,
"step": 6300
},
{
"epoch": 20.0,
"num_input_tokens_seen": 1349424,
"step": 6300,
"total_flos": 6.076395282353357e+16,
"train_loss": 0.10612090195964623,
"train_runtime": 586.1962,
"train_samples_per_second": 21.495,
"train_steps_per_second": 10.747
}
],
"logging_steps": 5,
"max_steps": 6300,
"num_input_tokens_seen": 1349424,
"num_train_epochs": 20,
"save_steps": 315,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.076395282353357e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}