{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.955414012738854,
"eval_steps": 500,
"global_step": 364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01910828025477707,
"grad_norm": 0.3258431400870058,
"learning_rate": 5.405405405405406e-09,
"loss": 0.2596,
"step": 1
},
{
"epoch": 0.03821656050955414,
"grad_norm": 0.2865253531051695,
"learning_rate": 1.0810810810810811e-08,
"loss": 0.2512,
"step": 2
},
{
"epoch": 0.05732484076433121,
"grad_norm": 0.30892863941613385,
"learning_rate": 1.6216216216216218e-08,
"loss": 0.2584,
"step": 3
},
{
"epoch": 0.07643312101910828,
"grad_norm": 0.3145090623295482,
"learning_rate": 2.1621621621621623e-08,
"loss": 0.2562,
"step": 4
},
{
"epoch": 0.09554140127388536,
"grad_norm": 0.3250498078418424,
"learning_rate": 2.7027027027027028e-08,
"loss": 0.2557,
"step": 5
},
{
"epoch": 0.11464968152866242,
"grad_norm": 0.3048291606019023,
"learning_rate": 3.2432432432432436e-08,
"loss": 0.2429,
"step": 6
},
{
"epoch": 0.1337579617834395,
"grad_norm": 0.2813946488902139,
"learning_rate": 3.783783783783784e-08,
"loss": 0.244,
"step": 7
},
{
"epoch": 0.15286624203821655,
"grad_norm": 0.2951556246869404,
"learning_rate": 4.3243243243243246e-08,
"loss": 0.2475,
"step": 8
},
{
"epoch": 0.17197452229299362,
"grad_norm": 0.31375967109328035,
"learning_rate": 4.864864864864865e-08,
"loss": 0.2548,
"step": 9
},
{
"epoch": 0.1910828025477707,
"grad_norm": 0.320646972442393,
"learning_rate": 5.4054054054054056e-08,
"loss": 0.2593,
"step": 10
},
{
"epoch": 0.21019108280254778,
"grad_norm": 0.3101235439496355,
"learning_rate": 5.945945945945946e-08,
"loss": 0.2606,
"step": 11
},
{
"epoch": 0.22929936305732485,
"grad_norm": 0.29711763418174875,
"learning_rate": 6.486486486486487e-08,
"loss": 0.2517,
"step": 12
},
{
"epoch": 0.2484076433121019,
"grad_norm": 0.3071305484501633,
"learning_rate": 7.027027027027027e-08,
"loss": 0.2494,
"step": 13
},
{
"epoch": 0.267515923566879,
"grad_norm": 0.2845310181092547,
"learning_rate": 7.567567567567568e-08,
"loss": 0.2491,
"step": 14
},
{
"epoch": 0.28662420382165604,
"grad_norm": 0.3203987747997849,
"learning_rate": 8.108108108108108e-08,
"loss": 0.2543,
"step": 15
},
{
"epoch": 0.3057324840764331,
"grad_norm": 0.28456107510027123,
"learning_rate": 8.648648648648649e-08,
"loss": 0.258,
"step": 16
},
{
"epoch": 0.3248407643312102,
"grad_norm": 0.27941243424990353,
"learning_rate": 9.189189189189189e-08,
"loss": 0.2574,
"step": 17
},
{
"epoch": 0.34394904458598724,
"grad_norm": 0.2930471474625414,
"learning_rate": 9.72972972972973e-08,
"loss": 0.2743,
"step": 18
},
{
"epoch": 0.3630573248407643,
"grad_norm": 0.30494204356688037,
"learning_rate": 1.027027027027027e-07,
"loss": 0.269,
"step": 19
},
{
"epoch": 0.3821656050955414,
"grad_norm": 0.30231424553130715,
"learning_rate": 1.0810810810810811e-07,
"loss": 0.2557,
"step": 20
},
{
"epoch": 0.4012738853503185,
"grad_norm": 0.3015266138690226,
"learning_rate": 1.135135135135135e-07,
"loss": 0.2523,
"step": 21
},
{
"epoch": 0.42038216560509556,
"grad_norm": 0.3043409829723809,
"learning_rate": 1.1891891891891891e-07,
"loss": 0.2547,
"step": 22
},
{
"epoch": 0.4394904458598726,
"grad_norm": 0.29444734831015446,
"learning_rate": 1.2432432432432432e-07,
"loss": 0.2517,
"step": 23
},
{
"epoch": 0.4585987261146497,
"grad_norm": 0.297937003697346,
"learning_rate": 1.2972972972972974e-07,
"loss": 0.2434,
"step": 24
},
{
"epoch": 0.47770700636942676,
"grad_norm": 0.3027152740282303,
"learning_rate": 1.3513513513513512e-07,
"loss": 0.2538,
"step": 25
},
{
"epoch": 0.4968152866242038,
"grad_norm": 0.3085528948055293,
"learning_rate": 1.4054054054054055e-07,
"loss": 0.2455,
"step": 26
},
{
"epoch": 0.5159235668789809,
"grad_norm": 0.30288755432020104,
"learning_rate": 1.4594594594594595e-07,
"loss": 0.2642,
"step": 27
},
{
"epoch": 0.535031847133758,
"grad_norm": 0.2888257868462652,
"learning_rate": 1.5135135135135135e-07,
"loss": 0.2535,
"step": 28
},
{
"epoch": 0.554140127388535,
"grad_norm": 0.31240869601074167,
"learning_rate": 1.5675675675675675e-07,
"loss": 0.2574,
"step": 29
},
{
"epoch": 0.5732484076433121,
"grad_norm": 0.28779468390754065,
"learning_rate": 1.6216216216216215e-07,
"loss": 0.2562,
"step": 30
},
{
"epoch": 0.5923566878980892,
"grad_norm": 0.3026300082985163,
"learning_rate": 1.6756756756756755e-07,
"loss": 0.2489,
"step": 31
},
{
"epoch": 0.6114649681528662,
"grad_norm": 0.29410215399776,
"learning_rate": 1.7297297297297298e-07,
"loss": 0.2513,
"step": 32
},
{
"epoch": 0.6305732484076433,
"grad_norm": 0.2967099409221429,
"learning_rate": 1.7837837837837836e-07,
"loss": 0.2605,
"step": 33
},
{
"epoch": 0.6496815286624203,
"grad_norm": 0.28890784109752826,
"learning_rate": 1.8378378378378379e-07,
"loss": 0.2488,
"step": 34
},
{
"epoch": 0.6687898089171974,
"grad_norm": 0.3228375228965286,
"learning_rate": 1.891891891891892e-07,
"loss": 0.2629,
"step": 35
},
{
"epoch": 0.6878980891719745,
"grad_norm": 0.30208391730174905,
"learning_rate": 1.945945945945946e-07,
"loss": 0.2662,
"step": 36
},
{
"epoch": 0.7070063694267515,
"grad_norm": 0.2895031799209941,
"learning_rate": 2e-07,
"loss": 0.2536,
"step": 37
},
{
"epoch": 0.7261146496815286,
"grad_norm": 0.3036410296650111,
"learning_rate": 1.999953850085163e-07,
"loss": 0.2654,
"step": 38
},
{
"epoch": 0.7452229299363057,
"grad_norm": 0.30128310490546484,
"learning_rate": 1.999815404600282e-07,
"loss": 0.2471,
"step": 39
},
{
"epoch": 0.7643312101910829,
"grad_norm": 0.3079903978462585,
"learning_rate": 1.999584676323851e-07,
"loss": 0.256,
"step": 40
},
{
"epoch": 0.7834394904458599,
"grad_norm": 0.2917481324310926,
"learning_rate": 1.9992616865520512e-07,
"loss": 0.2499,
"step": 41
},
{
"epoch": 0.802547770700637,
"grad_norm": 0.31471272570274456,
"learning_rate": 1.998846465096783e-07,
"loss": 0.2641,
"step": 42
},
{
"epoch": 0.821656050955414,
"grad_norm": 0.3152199748266918,
"learning_rate": 1.9983390502829166e-07,
"loss": 0.2593,
"step": 43
},
{
"epoch": 0.8407643312101911,
"grad_norm": 0.29879027476626574,
"learning_rate": 1.9977394889447523e-07,
"loss": 0.2686,
"step": 44
},
{
"epoch": 0.8598726114649682,
"grad_norm": 0.3172098054667475,
"learning_rate": 1.9970478364216996e-07,
"loss": 0.2408,
"step": 45
},
{
"epoch": 0.8789808917197452,
"grad_norm": 0.29042679625683193,
"learning_rate": 1.996264156553169e-07,
"loss": 0.2664,
"step": 46
},
{
"epoch": 0.8980891719745223,
"grad_norm": 0.30019512901289386,
"learning_rate": 1.9953885216726785e-07,
"loss": 0.2508,
"step": 47
},
{
"epoch": 0.9171974522292994,
"grad_norm": 0.28586435210675887,
"learning_rate": 1.9944210126011788e-07,
"loss": 0.2522,
"step": 48
},
{
"epoch": 0.9363057324840764,
"grad_norm": 0.30773911686783834,
"learning_rate": 1.9933617186395914e-07,
"loss": 0.2502,
"step": 49
},
{
"epoch": 0.9554140127388535,
"grad_norm": 0.31371477716041124,
"learning_rate": 1.9922107375605698e-07,
"loss": 0.2619,
"step": 50
},
{
"epoch": 0.9745222929936306,
"grad_norm": 0.2950309178219575,
"learning_rate": 1.990968175599471e-07,
"loss": 0.2613,
"step": 51
},
{
"epoch": 0.9936305732484076,
"grad_norm": 0.3320695491566013,
"learning_rate": 1.9896341474445524e-07,
"loss": 0.2541,
"step": 52
},
{
"epoch": 1.0127388535031847,
"grad_norm": 0.2902283899600383,
"learning_rate": 1.9882087762263852e-07,
"loss": 0.2485,
"step": 53
},
{
"epoch": 1.0318471337579618,
"grad_norm": 0.30704788847785364,
"learning_rate": 1.9866921935064905e-07,
"loss": 0.2579,
"step": 54
},
{
"epoch": 1.0509554140127388,
"grad_norm": 0.2786908392929887,
"learning_rate": 1.9850845392651947e-07,
"loss": 0.2638,
"step": 55
},
{
"epoch": 1.070063694267516,
"grad_norm": 0.317768378195936,
"learning_rate": 1.983385961888711e-07,
"loss": 0.2483,
"step": 56
},
{
"epoch": 1.089171974522293,
"grad_norm": 0.2999290392989265,
"learning_rate": 1.981596618155441e-07,
"loss": 0.2604,
"step": 57
},
{
"epoch": 1.10828025477707,
"grad_norm": 0.31058999539375254,
"learning_rate": 1.9797166732215075e-07,
"loss": 0.2619,
"step": 58
},
{
"epoch": 1.127388535031847,
"grad_norm": 0.3064782254040447,
"learning_rate": 1.977746300605507e-07,
"loss": 0.2607,
"step": 59
},
{
"epoch": 1.1464968152866242,
"grad_norm": 0.3002412022740287,
"learning_rate": 1.9756856821724967e-07,
"loss": 0.2617,
"step": 60
},
{
"epoch": 1.1656050955414012,
"grad_norm": 0.29772159876550963,
"learning_rate": 1.9735350081172067e-07,
"loss": 0.2481,
"step": 61
},
{
"epoch": 1.1847133757961783,
"grad_norm": 0.31356783559096124,
"learning_rate": 1.9712944769464862e-07,
"loss": 0.2587,
"step": 62
},
{
"epoch": 1.2038216560509554,
"grad_norm": 0.2888581149656851,
"learning_rate": 1.9689642954609806e-07,
"loss": 0.2538,
"step": 63
},
{
"epoch": 1.2229299363057324,
"grad_norm": 0.30385671289298677,
"learning_rate": 1.966544678736044e-07,
"loss": 0.2527,
"step": 64
},
{
"epoch": 1.2420382165605095,
"grad_norm": 0.2973647308882802,
"learning_rate": 1.9640358501018882e-07,
"loss": 0.2479,
"step": 65
},
{
"epoch": 1.2611464968152866,
"grad_norm": 0.3095848016456744,
"learning_rate": 1.961438041122969e-07,
"loss": 0.2482,
"step": 66
},
{
"epoch": 1.2802547770700636,
"grad_norm": 0.3515887000957586,
"learning_rate": 1.9587514915766122e-07,
"loss": 0.253,
"step": 67
},
{
"epoch": 1.2993630573248407,
"grad_norm": 0.2931897201883793,
"learning_rate": 1.9559764494308834e-07,
"loss": 0.2457,
"step": 68
},
{
"epoch": 1.3184713375796178,
"grad_norm": 0.31283649147238096,
"learning_rate": 1.9531131708217004e-07,
"loss": 0.2554,
"step": 69
},
{
"epoch": 1.3375796178343948,
"grad_norm": 0.30295461967215936,
"learning_rate": 1.9501619200291905e-07,
"loss": 0.2561,
"step": 70
},
{
"epoch": 1.356687898089172,
"grad_norm": 0.2910098967517895,
"learning_rate": 1.9471229694533e-07,
"loss": 0.264,
"step": 71
},
{
"epoch": 1.3757961783439492,
"grad_norm": 0.29074377848027877,
"learning_rate": 1.9439965995886488e-07,
"loss": 0.2579,
"step": 72
},
{
"epoch": 1.394904458598726,
"grad_norm": 0.2913116920333876,
"learning_rate": 1.9407830989986428e-07,
"loss": 0.2515,
"step": 73
},
{
"epoch": 1.4140127388535033,
"grad_norm": 0.28025336367518355,
"learning_rate": 1.9374827642888395e-07,
"loss": 0.2592,
"step": 74
},
{
"epoch": 1.4331210191082802,
"grad_norm": 0.308381777961692,
"learning_rate": 1.9340959000795706e-07,
"loss": 0.2559,
"step": 75
},
{
"epoch": 1.4522292993630574,
"grad_norm": 0.29320496586078204,
"learning_rate": 1.9306228189778253e-07,
"loss": 0.2658,
"step": 76
},
{
"epoch": 1.4713375796178343,
"grad_norm": 0.32221369887070983,
"learning_rate": 1.927063841548398e-07,
"loss": 0.2646,
"step": 77
},
{
"epoch": 1.4904458598726116,
"grad_norm": 0.299728133006886,
"learning_rate": 1.923419296284299e-07,
"loss": 0.2581,
"step": 78
},
{
"epoch": 1.5095541401273884,
"grad_norm": 0.3209108114734401,
"learning_rate": 1.919689519576436e-07,
"loss": 0.2567,
"step": 79
},
{
"epoch": 1.5286624203821657,
"grad_norm": 0.2804129315244473,
"learning_rate": 1.9158748556825634e-07,
"loss": 0.253,
"step": 80
},
{
"epoch": 1.5477707006369426,
"grad_norm": 0.3045816243631572,
"learning_rate": 1.911975656695509e-07,
"loss": 0.2584,
"step": 81
},
{
"epoch": 1.5668789808917198,
"grad_norm": 0.3126990342175922,
"learning_rate": 1.907992282510675e-07,
"loss": 0.2605,
"step": 82
},
{
"epoch": 1.5859872611464967,
"grad_norm": 0.30614700686477747,
"learning_rate": 1.90392510079282e-07,
"loss": 0.2548,
"step": 83
},
{
"epoch": 1.605095541401274,
"grad_norm": 0.2971554730236426,
"learning_rate": 1.8997744869421245e-07,
"loss": 0.2354,
"step": 84
},
{
"epoch": 1.6242038216560508,
"grad_norm": 0.2988208824377952,
"learning_rate": 1.8955408240595392e-07,
"loss": 0.2441,
"step": 85
},
{
"epoch": 1.643312101910828,
"grad_norm": 0.2947992742232842,
"learning_rate": 1.8912245029114278e-07,
"loss": 0.2503,
"step": 86
},
{
"epoch": 1.662420382165605,
"grad_norm": 0.3147211833492056,
"learning_rate": 1.8868259218934966e-07,
"loss": 0.2529,
"step": 87
},
{
"epoch": 1.6815286624203822,
"grad_norm": 0.3150826072001003,
"learning_rate": 1.882345486994024e-07,
"loss": 0.2492,
"step": 88
},
{
"epoch": 1.700636942675159,
"grad_norm": 0.29922927789546194,
"learning_rate": 1.877783611756389e-07,
"loss": 0.2657,
"step": 89
},
{
"epoch": 1.7197452229299364,
"grad_norm": 0.2841068444708262,
"learning_rate": 1.8731407172408987e-07,
"loss": 0.2503,
"step": 90
},
{
"epoch": 1.7388535031847132,
"grad_norm": 0.3000862442323796,
"learning_rate": 1.8684172319859257e-07,
"loss": 0.2553,
"step": 91
},
{
"epoch": 1.7579617834394905,
"grad_norm": 0.3058443181160777,
"learning_rate": 1.863613591968355e-07,
"loss": 0.2572,
"step": 92
},
{
"epoch": 1.7770700636942676,
"grad_norm": 0.31867483871696056,
"learning_rate": 1.8587302405633417e-07,
"loss": 0.2527,
"step": 93
},
{
"epoch": 1.7961783439490446,
"grad_norm": 0.3025042298897706,
"learning_rate": 1.8537676285033885e-07,
"loss": 0.2595,
"step": 94
},
{
"epoch": 1.8152866242038217,
"grad_norm": 0.2923220305017809,
"learning_rate": 1.848726213836744e-07,
"loss": 0.2577,
"step": 95
},
{
"epoch": 1.8343949044585988,
"grad_norm": 0.3003765605795671,
"learning_rate": 1.8436064618851224e-07,
"loss": 0.2521,
"step": 96
},
{
"epoch": 1.8535031847133758,
"grad_norm": 0.317610406351691,
"learning_rate": 1.8384088452007576e-07,
"loss": 0.2502,
"step": 97
},
{
"epoch": 1.872611464968153,
"grad_norm": 0.2887579767892906,
"learning_rate": 1.8331338435227837e-07,
"loss": 0.2586,
"step": 98
},
{
"epoch": 1.89171974522293,
"grad_norm": 0.3197071782148506,
"learning_rate": 1.8277819437329574e-07,
"loss": 0.267,
"step": 99
},
{
"epoch": 1.910828025477707,
"grad_norm": 0.2849069609408447,
"learning_rate": 1.8223536398107174e-07,
"loss": 0.2485,
"step": 100
},
{
"epoch": 1.929936305732484,
"grad_norm": 0.2887703170078543,
"learning_rate": 1.8168494327875916e-07,
"loss": 0.2499,
"step": 101
},
{
"epoch": 1.9490445859872612,
"grad_norm": 0.3161969478865281,
"learning_rate": 1.8112698307009504e-07,
"loss": 0.2505,
"step": 102
},
{
"epoch": 1.9681528662420382,
"grad_norm": 0.287901366472073,
"learning_rate": 1.8056153485471165e-07,
"loss": 0.2668,
"step": 103
},
{
"epoch": 1.9872611464968153,
"grad_norm": 0.3012560634417445,
"learning_rate": 1.7998865082338287e-07,
"loss": 0.2565,
"step": 104
},
{
"epoch": 2.0063694267515926,
"grad_norm": 0.29276615150168056,
"learning_rate": 1.7940838385320732e-07,
"loss": 0.2533,
"step": 105
},
{
"epoch": 2.0254777070063694,
"grad_norm": 0.30402861564543393,
"learning_rate": 1.788207875027274e-07,
"loss": 0.2525,
"step": 106
},
{
"epoch": 2.0445859872611467,
"grad_norm": 0.2967173967659541,
"learning_rate": 1.7822591600698629e-07,
"loss": 0.2532,
"step": 107
},
{
"epoch": 2.0636942675159236,
"grad_norm": 0.30366484369127167,
"learning_rate": 1.7762382427252165e-07,
"loss": 0.2573,
"step": 108
},
{
"epoch": 2.082802547770701,
"grad_norm": 0.2878188158407926,
"learning_rate": 1.7701456787229803e-07,
"loss": 0.2602,
"step": 109
},
{
"epoch": 2.1019108280254777,
"grad_norm": 0.2743059722128589,
"learning_rate": 1.7639820304057742e-07,
"loss": 0.2554,
"step": 110
},
{
"epoch": 2.121019108280255,
"grad_norm": 0.29808343098853324,
"learning_rate": 1.7577478666772882e-07,
"loss": 0.253,
"step": 111
},
{
"epoch": 2.140127388535032,
"grad_norm": 0.2875587315242635,
"learning_rate": 1.7514437629497717e-07,
"loss": 0.2488,
"step": 112
},
{
"epoch": 2.159235668789809,
"grad_norm": 0.30351361854827635,
"learning_rate": 1.7450703010909262e-07,
"loss": 0.2562,
"step": 113
},
{
"epoch": 2.178343949044586,
"grad_norm": 0.30394129590401453,
"learning_rate": 1.738628069370195e-07,
"loss": 0.2607,
"step": 114
},
{
"epoch": 2.1974522292993632,
"grad_norm": 0.32285950783403167,
"learning_rate": 1.7321176624044687e-07,
"loss": 0.2503,
"step": 115
},
{
"epoch": 2.21656050955414,
"grad_norm": 0.3120864765033612,
"learning_rate": 1.7255396811032013e-07,
"loss": 0.2509,
"step": 116
},
{
"epoch": 2.2356687898089174,
"grad_norm": 0.3236920273170069,
"learning_rate": 1.718894732612947e-07,
"loss": 0.2502,
"step": 117
},
{
"epoch": 2.254777070063694,
"grad_norm": 0.29570882874175936,
"learning_rate": 1.7121834302613186e-07,
"loss": 0.2639,
"step": 118
},
{
"epoch": 2.2738853503184715,
"grad_norm": 0.31678397348330944,
"learning_rate": 1.7054063935003812e-07,
"loss": 0.2496,
"step": 119
},
{
"epoch": 2.2929936305732483,
"grad_norm": 0.2988770843936996,
"learning_rate": 1.6985642478494727e-07,
"loss": 0.2507,
"step": 120
},
{
"epoch": 2.3121019108280256,
"grad_norm": 0.3109434916642981,
"learning_rate": 1.6916576248374716e-07,
"loss": 0.2616,
"step": 121
},
{
"epoch": 2.3312101910828025,
"grad_norm": 0.3261007418875524,
"learning_rate": 1.684687161944506e-07,
"loss": 0.2554,
"step": 122
},
{
"epoch": 2.3503184713375798,
"grad_norm": 0.31440782489796776,
"learning_rate": 1.6776535025431129e-07,
"loss": 0.2559,
"step": 123
},
{
"epoch": 2.3694267515923566,
"grad_norm": 0.30410776957586444,
"learning_rate": 1.6705572958388573e-07,
"loss": 0.2603,
"step": 124
},
{
"epoch": 2.388535031847134,
"grad_norm": 0.30666376843334026,
"learning_rate": 1.6633991968104092e-07,
"loss": 0.2439,
"step": 125
},
{
"epoch": 2.4076433121019107,
"grad_norm": 0.3133700400691411,
"learning_rate": 1.6561798661490902e-07,
"loss": 0.2514,
"step": 126
},
{
"epoch": 2.426751592356688,
"grad_norm": 0.30312163468190945,
"learning_rate": 1.6488999701978902e-07,
"loss": 0.2522,
"step": 127
},
{
"epoch": 2.445859872611465,
"grad_norm": 0.2980517023756582,
"learning_rate": 1.6415601808899658e-07,
"loss": 0.2634,
"step": 128
},
{
"epoch": 2.464968152866242,
"grad_norm": 0.29620642402954184,
"learning_rate": 1.63416117568662e-07,
"loss": 0.2523,
"step": 129
},
{
"epoch": 2.484076433121019,
"grad_norm": 0.30847707783645134,
"learning_rate": 1.6267036375147723e-07,
"loss": 0.2519,
"step": 130
},
{
"epoch": 2.5031847133757963,
"grad_norm": 0.3150233916320143,
"learning_rate": 1.6191882547039266e-07,
"loss": 0.257,
"step": 131
},
{
"epoch": 2.522292993630573,
"grad_norm": 0.27358778066993983,
"learning_rate": 1.6116157209226352e-07,
"loss": 0.2647,
"step": 132
},
{
"epoch": 2.5414012738853504,
"grad_norm": 0.3019435375253268,
"learning_rate": 1.6039867351144777e-07,
"loss": 0.2647,
"step": 133
},
{
"epoch": 2.5605095541401273,
"grad_norm": 0.30895517117616256,
"learning_rate": 1.5963020014335436e-07,
"loss": 0.2559,
"step": 134
},
{
"epoch": 2.5796178343949046,
"grad_norm": 0.31282726024916446,
"learning_rate": 1.5885622291794428e-07,
"loss": 0.2473,
"step": 135
},
{
"epoch": 2.5987261146496814,
"grad_norm": 0.29386800602210383,
"learning_rate": 1.580768132731837e-07,
"loss": 0.2564,
"step": 136
},
{
"epoch": 2.6178343949044587,
"grad_norm": 0.2994659441023476,
"learning_rate": 1.5729204314845e-07,
"loss": 0.2593,
"step": 137
},
{
"epoch": 2.6369426751592355,
"grad_norm": 0.306461349101831,
"learning_rate": 1.56501984977892e-07,
"loss": 0.2618,
"step": 138
},
{
"epoch": 2.656050955414013,
"grad_norm": 0.28736145386635525,
"learning_rate": 1.5570671168374436e-07,
"loss": 0.2702,
"step": 139
},
{
"epoch": 2.6751592356687897,
"grad_norm": 0.2862972968768037,
"learning_rate": 1.5490629666959666e-07,
"loss": 0.2604,
"step": 140
},
{
"epoch": 2.694267515923567,
"grad_norm": 0.3188610660653441,
"learning_rate": 1.5410081381361829e-07,
"loss": 0.2671,
"step": 141
},
{
"epoch": 2.713375796178344,
"grad_norm": 0.3094794846413303,
"learning_rate": 1.5329033746173973e-07,
"loss": 0.2535,
"step": 142
},
{
"epoch": 2.732484076433121,
"grad_norm": 0.29891576281966575,
"learning_rate": 1.5247494242079021e-07,
"loss": 0.252,
"step": 143
},
{
"epoch": 2.7515923566878984,
"grad_norm": 0.292028133356272,
"learning_rate": 1.5165470395159313e-07,
"loss": 0.2517,
"step": 144
},
{
"epoch": 2.770700636942675,
"grad_norm": 0.3015119959336549,
"learning_rate": 1.5082969776201945e-07,
"loss": 0.2485,
"step": 145
},
{
"epoch": 2.789808917197452,
"grad_norm": 0.30946055407183126,
"learning_rate": 1.5e-07,
"loss": 0.2549,
"step": 146
},
{
"epoch": 2.8089171974522293,
"grad_norm": 0.3011288987792503,
"learning_rate": 1.4916568724649686e-07,
"loss": 0.2526,
"step": 147
},
{
"epoch": 2.8280254777070066,
"grad_norm": 0.27935366900331016,
"learning_rate": 1.4832683650843506e-07,
"loss": 0.2569,
"step": 148
},
{
"epoch": 2.8471337579617835,
"grad_norm": 0.3053623378204874,
"learning_rate": 1.4748352521159491e-07,
"loss": 0.2543,
"step": 149
},
{
"epoch": 2.8662420382165603,
"grad_norm": 0.30986894267799603,
"learning_rate": 1.4663583119346538e-07,
"loss": 0.2414,
"step": 150
},
{
"epoch": 2.8853503184713376,
"grad_norm": 0.3029701849877518,
"learning_rate": 1.4578383269606002e-07,
"loss": 0.2645,
"step": 151
},
{
"epoch": 2.904458598726115,
"grad_norm": 0.3007935845890958,
"learning_rate": 1.4492760835869502e-07,
"loss": 0.2524,
"step": 152
},
{
"epoch": 2.9235668789808917,
"grad_norm": 0.3027359482846415,
"learning_rate": 1.4406723721073087e-07,
"loss": 0.2399,
"step": 153
},
{
"epoch": 2.9426751592356686,
"grad_norm": 0.286593283803398,
"learning_rate": 1.4320279866427796e-07,
"loss": 0.2491,
"step": 154
},
{
"epoch": 2.961783439490446,
"grad_norm": 0.2975044719847865,
"learning_rate": 1.4233437250686693e-07,
"loss": 0.2556,
"step": 155
},
{
"epoch": 2.980891719745223,
"grad_norm": 0.29788132002693174,
"learning_rate": 1.4146203889408418e-07,
"loss": 0.242,
"step": 156
},
{
"epoch": 3.0,
"grad_norm": 0.2947537039751411,
"learning_rate": 1.4058587834217354e-07,
"loss": 0.2584,
"step": 157
},
{
"epoch": 3.0191082802547773,
"grad_norm": 0.29945926345973045,
"learning_rate": 1.397059717206048e-07,
"loss": 0.2591,
"step": 158
},
{
"epoch": 3.038216560509554,
"grad_norm": 0.3078621269226234,
"learning_rate": 1.3882240024460924e-07,
"loss": 0.2587,
"step": 159
},
{
"epoch": 3.0573248407643314,
"grad_norm": 0.30621687808211867,
"learning_rate": 1.3793524546768356e-07,
"loss": 0.2603,
"step": 160
},
{
"epoch": 3.0764331210191083,
"grad_norm": 0.30958764885598344,
"learning_rate": 1.370445892740626e-07,
"loss": 0.2594,
"step": 161
},
{
"epoch": 3.0955414012738856,
"grad_norm": 0.3117099403903537,
"learning_rate": 1.361505138711613e-07,
"loss": 0.2538,
"step": 162
},
{
"epoch": 3.1146496815286624,
"grad_norm": 0.31084130617285904,
"learning_rate": 1.3525310178198706e-07,
"loss": 0.2658,
"step": 163
},
{
"epoch": 3.1337579617834397,
"grad_norm": 0.30690130397546955,
"learning_rate": 1.343524358375229e-07,
"loss": 0.2495,
"step": 164
},
{
"epoch": 3.1528662420382165,
"grad_norm": 0.290775795962111,
"learning_rate": 1.3344859916908204e-07,
"loss": 0.2574,
"step": 165
},
{
"epoch": 3.171974522292994,
"grad_norm": 0.31192756387768816,
"learning_rate": 1.325416752006351e-07,
"loss": 0.2548,
"step": 166
},
{
"epoch": 3.1910828025477707,
"grad_norm": 0.2970090514175917,
"learning_rate": 1.3163174764110982e-07,
"loss": 0.248,
"step": 167
},
{
"epoch": 3.210191082802548,
"grad_norm": 0.2904565168466135,
"learning_rate": 1.3071890047666496e-07,
"loss": 0.2469,
"step": 168
},
{
"epoch": 3.229299363057325,
"grad_norm": 0.30572059607999896,
"learning_rate": 1.2980321796293835e-07,
"loss": 0.2433,
"step": 169
},
{
"epoch": 3.248407643312102,
"grad_norm": 0.3080588666919199,
"learning_rate": 1.288847846172701e-07,
"loss": 0.2455,
"step": 170
},
{
"epoch": 3.267515923566879,
"grad_norm": 0.33087482625160974,
"learning_rate": 1.2796368521090143e-07,
"loss": 0.2563,
"step": 171
},
{
"epoch": 3.286624203821656,
"grad_norm": 0.29813533166624945,
"learning_rate": 1.270400047611508e-07,
"loss": 0.2486,
"step": 172
},
{
"epoch": 3.305732484076433,
"grad_norm": 0.3086312021030327,
"learning_rate": 1.261138285235663e-07,
"loss": 0.2458,
"step": 173
},
{
"epoch": 3.3248407643312103,
"grad_norm": 0.29331773923085075,
"learning_rate": 1.2518524198405698e-07,
"loss": 0.2709,
"step": 174
},
{
"epoch": 3.343949044585987,
"grad_norm": 0.3389129034608298,
"learning_rate": 1.2425433085100222e-07,
"loss": 0.249,
"step": 175
},
{
"epoch": 3.3630573248407645,
"grad_norm": 0.27583808400445303,
"learning_rate": 1.2332118104734109e-07,
"loss": 0.2593,
"step": 176
},
{
"epoch": 3.3821656050955413,
"grad_norm": 0.30026419416491845,
"learning_rate": 1.223858787026415e-07,
"loss": 0.2571,
"step": 177
},
{
"epoch": 3.4012738853503186,
"grad_norm": 0.2870756139104833,
"learning_rate": 1.2144851014515054e-07,
"loss": 0.2433,
"step": 178
},
{
"epoch": 3.4203821656050954,
"grad_norm": 0.2872438134366015,
"learning_rate": 1.2050916189382645e-07,
"loss": 0.2612,
"step": 179
},
{
"epoch": 3.4394904458598727,
"grad_norm": 0.3163865718075819,
"learning_rate": 1.195679206503528e-07,
"loss": 0.2549,
"step": 180
},
{
"epoch": 3.4585987261146496,
"grad_norm": 0.2978921734506274,
"learning_rate": 1.1862487329113604e-07,
"loss": 0.2622,
"step": 181
},
{
"epoch": 3.477707006369427,
"grad_norm": 0.2948938802787196,
"learning_rate": 1.1768010685928685e-07,
"loss": 0.2556,
"step": 182
},
{
"epoch": 3.4968152866242037,
"grad_norm": 0.31770785254670114,
"learning_rate": 1.1673370855658591e-07,
"loss": 0.2564,
"step": 183
},
{
"epoch": 3.515923566878981,
"grad_norm": 0.3077187664088862,
"learning_rate": 1.1578576573543539e-07,
"loss": 0.2603,
"step": 184
},
{
"epoch": 3.535031847133758,
"grad_norm": 0.27940138197404885,
"learning_rate": 1.1483636589079626e-07,
"loss": 0.2537,
"step": 185
},
{
"epoch": 3.554140127388535,
"grad_norm": 0.30920602027740407,
"learning_rate": 1.138855966521124e-07,
"loss": 0.2605,
"step": 186
},
{
"epoch": 3.573248407643312,
"grad_norm": 0.2878030111034016,
"learning_rate": 1.1293354577522263e-07,
"loss": 0.2642,
"step": 187
},
{
"epoch": 3.5923566878980893,
"grad_norm": 0.31242615729718354,
"learning_rate": 1.1198030113426074e-07,
"loss": 0.2689,
"step": 188
},
{
"epoch": 3.611464968152866,
"grad_norm": 0.2948055240186696,
"learning_rate": 1.110259507135447e-07,
"loss": 0.268,
"step": 189
},
{
"epoch": 3.6305732484076434,
"grad_norm": 0.2983537474983395,
"learning_rate": 1.1007058259945583e-07,
"loss": 0.25,
"step": 190
},
{
"epoch": 3.6496815286624202,
"grad_norm": 0.30237646104024896,
"learning_rate": 1.0911428497230832e-07,
"loss": 0.2398,
"step": 191
},
{
"epoch": 3.6687898089171975,
"grad_norm": 0.29700896373492647,
"learning_rate": 1.0815714609821025e-07,
"loss": 0.2568,
"step": 192
},
{
"epoch": 3.6878980891719744,
"grad_norm": 0.30679225984995734,
"learning_rate": 1.071992543209167e-07,
"loss": 0.2401,
"step": 193
},
{
"epoch": 3.7070063694267517,
"grad_norm": 0.3007632644983776,
"learning_rate": 1.0624069805367557e-07,
"loss": 0.2477,
"step": 194
},
{
"epoch": 3.7261146496815285,
"grad_norm": 0.3112671485408903,
"learning_rate": 1.0528156577106702e-07,
"loss": 0.2463,
"step": 195
},
{
"epoch": 3.745222929936306,
"grad_norm": 0.30401064479289264,
"learning_rate": 1.0432194600083739e-07,
"loss": 0.2574,
"step": 196
},
{
"epoch": 3.7643312101910826,
"grad_norm": 0.2851374502628386,
"learning_rate": 1.0336192731572803e-07,
"loss": 0.2582,
"step": 197
},
{
"epoch": 3.78343949044586,
"grad_norm": 0.2914379090084522,
"learning_rate": 1.0240159832530007e-07,
"loss": 0.2555,
"step": 198
},
{
"epoch": 3.802547770700637,
"grad_norm": 0.33178330655356186,
"learning_rate": 1.0144104766775572e-07,
"loss": 0.2614,
"step": 199
},
{
"epoch": 3.821656050955414,
"grad_norm": 0.31640502862571734,
"learning_rate": 1.0048036400175708e-07,
"loss": 0.235,
"step": 200
},
{
"epoch": 3.840764331210191,
"grad_norm": 0.3161044897697338,
"learning_rate": 9.951963599824293e-08,
"loss": 0.2433,
"step": 201
},
{
"epoch": 3.859872611464968,
"grad_norm": 0.2960452451709703,
"learning_rate": 9.855895233224429e-08,
"loss": 0.2589,
"step": 202
},
{
"epoch": 3.8789808917197455,
"grad_norm": 0.2881678617442765,
"learning_rate": 9.759840167469994e-08,
"loss": 0.257,
"step": 203
},
{
"epoch": 3.8980891719745223,
"grad_norm": 0.3020128060320192,
"learning_rate": 9.663807268427197e-08,
"loss": 0.2489,
"step": 204
},
{
"epoch": 3.917197452229299,
"grad_norm": 0.303673255967667,
"learning_rate": 9.567805399916259e-08,
"loss": 0.2552,
"step": 205
},
{
"epoch": 3.9363057324840764,
"grad_norm": 0.3029517107073107,
"learning_rate": 9.471843422893297e-08,
"loss": 0.2545,
"step": 206
},
{
"epoch": 3.9554140127388537,
"grad_norm": 0.29991781394420536,
"learning_rate": 9.375930194632446e-08,
"loss": 0.2541,
"step": 207
},
{
"epoch": 3.9745222929936306,
"grad_norm": 0.29801939082138607,
"learning_rate": 9.28007456790833e-08,
"loss": 0.2651,
"step": 208
},
{
"epoch": 3.9936305732484074,
"grad_norm": 0.3107836552290268,
"learning_rate": 9.184285390178977e-08,
"loss": 0.245,
"step": 209
},
{
"epoch": 4.012738853503185,
"grad_norm": 0.31402034579974897,
"learning_rate": 9.088571502769167e-08,
"loss": 0.261,
"step": 210
},
{
"epoch": 4.031847133757962,
"grad_norm": 0.29505379895390366,
"learning_rate": 8.992941740054417e-08,
"loss": 0.2435,
"step": 211
},
{
"epoch": 4.050955414012739,
"grad_norm": 0.29509128808113044,
"learning_rate": 8.897404928645527e-08,
"loss": 0.2445,
"step": 212
},
{
"epoch": 4.070063694267516,
"grad_norm": 0.29570344259163295,
"learning_rate": 8.801969886573929e-08,
"loss": 0.2698,
"step": 213
},
{
"epoch": 4.089171974522293,
"grad_norm": 0.29346662152654746,
"learning_rate": 8.706645422477737e-08,
"loss": 0.2597,
"step": 214
},
{
"epoch": 4.10828025477707,
"grad_norm": 0.2805185175645672,
"learning_rate": 8.611440334788762e-08,
"loss": 0.2586,
"step": 215
},
{
"epoch": 4.127388535031847,
"grad_norm": 0.30742002754181397,
"learning_rate": 8.516363410920375e-08,
"loss": 0.2478,
"step": 216
},
{
"epoch": 4.146496815286624,
"grad_norm": 0.2850884458824681,
"learning_rate": 8.42142342645646e-08,
"loss": 0.2602,
"step": 217
},
{
"epoch": 4.165605095541402,
"grad_norm": 0.306269262929328,
"learning_rate": 8.326629144341405e-08,
"loss": 0.2515,
"step": 218
},
{
"epoch": 4.1847133757961785,
"grad_norm": 0.3018034076236653,
"learning_rate": 8.231989314071316e-08,
"loss": 0.2471,
"step": 219
},
{
"epoch": 4.203821656050955,
"grad_norm": 0.3185658018423822,
"learning_rate": 8.137512670886396e-08,
"loss": 0.2615,
"step": 220
},
{
"epoch": 4.222929936305732,
"grad_norm": 0.29628833966395685,
"learning_rate": 8.04320793496472e-08,
"loss": 0.2577,
"step": 221
},
{
"epoch": 4.24203821656051,
"grad_norm": 0.2922501668352887,
"learning_rate": 7.949083810617357e-08,
"loss": 0.2705,
"step": 222
},
{
"epoch": 4.261146496815287,
"grad_norm": 0.2961392897242003,
"learning_rate": 7.855148985484945e-08,
"loss": 0.2499,
"step": 223
},
{
"epoch": 4.280254777070064,
"grad_norm": 0.2953084567404899,
"learning_rate": 7.761412129735851e-08,
"loss": 0.2529,
"step": 224
},
{
"epoch": 4.2993630573248405,
"grad_norm": 0.31348462098681923,
"learning_rate": 7.667881895265893e-08,
"loss": 0.2458,
"step": 225
},
{
"epoch": 4.318471337579618,
"grad_norm": 0.30412264596940947,
"learning_rate": 7.574566914899778e-08,
"loss": 0.2548,
"step": 226
},
{
"epoch": 4.337579617834395,
"grad_norm": 0.2847001566971413,
"learning_rate": 7.481475801594301e-08,
"loss": 0.2534,
"step": 227
},
{
"epoch": 4.356687898089172,
"grad_norm": 0.2853350717625523,
"learning_rate": 7.38861714764337e-08,
"loss": 0.2422,
"step": 228
},
{
"epoch": 4.375796178343949,
"grad_norm": 0.29979993619083456,
"learning_rate": 7.29599952388492e-08,
"loss": 0.2592,
"step": 229
},
{
"epoch": 4.3949044585987265,
"grad_norm": 0.3007849327371836,
"learning_rate": 7.203631478909857e-08,
"loss": 0.2487,
"step": 230
},
{
"epoch": 4.414012738853503,
"grad_norm": 0.2964965887396194,
"learning_rate": 7.111521538272996e-08,
"loss": 0.2591,
"step": 231
},
{
"epoch": 4.43312101910828,
"grad_norm": 0.29153090184611197,
"learning_rate": 7.019678203706163e-08,
"loss": 0.2506,
"step": 232
},
{
"epoch": 4.452229299363057,
"grad_norm": 0.31293925272597667,
"learning_rate": 6.928109952333506e-08,
"loss": 0.2545,
"step": 233
},
{
"epoch": 4.471337579617835,
"grad_norm": 0.31681568074468586,
"learning_rate": 6.836825235889018e-08,
"loss": 0.2566,
"step": 234
},
{
"epoch": 4.490445859872612,
"grad_norm": 0.3303594370705204,
"learning_rate": 6.74583247993649e-08,
"loss": 0.2528,
"step": 235
},
{
"epoch": 4.509554140127388,
"grad_norm": 0.3117652488003975,
"learning_rate": 6.655140083091793e-08,
"loss": 0.2467,
"step": 236
},
{
"epoch": 4.528662420382165,
"grad_norm": 0.3169598110655772,
"learning_rate": 6.56475641624771e-08,
"loss": 0.2639,
"step": 237
},
{
"epoch": 4.547770700636943,
"grad_norm": 0.3243563781200814,
"learning_rate": 6.474689821801294e-08,
"loss": 0.2687,
"step": 238
},
{
"epoch": 4.56687898089172,
"grad_norm": 0.289124780632532,
"learning_rate": 6.384948612883871e-08,
"loss": 0.2603,
"step": 239
},
{
"epoch": 4.585987261146497,
"grad_norm": 0.29440728893533646,
"learning_rate": 6.29554107259374e-08,
"loss": 0.2477,
"step": 240
},
{
"epoch": 4.6050955414012735,
"grad_norm": 0.28967915725187654,
"learning_rate": 6.206475453231643e-08,
"loss": 0.2498,
"step": 241
},
{
"epoch": 4.624203821656051,
"grad_norm": 0.3078511207653716,
"learning_rate": 6.117759975539074e-08,
"loss": 0.2536,
"step": 242
},
{
"epoch": 4.643312101910828,
"grad_norm": 0.3087724727800373,
"learning_rate": 6.029402827939519e-08,
"loss": 0.2475,
"step": 243
},
{
"epoch": 4.662420382165605,
"grad_norm": 0.30886308506307386,
"learning_rate": 5.941412165782644e-08,
"loss": 0.2646,
"step": 244
},
{
"epoch": 4.681528662420382,
"grad_norm": 0.30050126959049084,
"learning_rate": 5.853796110591582e-08,
"loss": 0.2516,
"step": 245
},
{
"epoch": 4.7006369426751595,
"grad_norm": 0.2863534353646146,
"learning_rate": 5.7665627493133084e-08,
"loss": 0.254,
"step": 246
},
{
"epoch": 4.719745222929936,
"grad_norm": 0.31186833842781353,
"learning_rate": 5.6797201335722055e-08,
"loss": 0.2636,
"step": 247
},
{
"epoch": 4.738853503184713,
"grad_norm": 0.33081843113519477,
"learning_rate": 5.593276278926912e-08,
"loss": 0.2439,
"step": 248
},
{
"epoch": 4.757961783439491,
"grad_norm": 0.3034226509534758,
"learning_rate": 5.5072391641305003e-08,
"loss": 0.2547,
"step": 249
},
{
"epoch": 4.777070063694268,
"grad_norm": 0.29722750131690423,
"learning_rate": 5.4216167303939996e-08,
"loss": 0.2526,
"step": 250
},
{
"epoch": 4.796178343949045,
"grad_norm": 0.2988145357986916,
"learning_rate": 5.33641688065346e-08,
"loss": 0.2547,
"step": 251
},
{
"epoch": 4.8152866242038215,
"grad_norm": 0.30646135021295096,
"learning_rate": 5.251647478840511e-08,
"loss": 0.2484,
"step": 252
},
{
"epoch": 4.834394904458598,
"grad_norm": 0.3157144375006897,
"learning_rate": 5.167316349156494e-08,
"loss": 0.2419,
"step": 253
},
{
"epoch": 4.853503184713376,
"grad_norm": 0.2983724975662055,
"learning_rate": 5.0834312753503117e-08,
"loss": 0.2589,
"step": 254
},
{
"epoch": 4.872611464968153,
"grad_norm": 0.3102633335125285,
"learning_rate": 5.000000000000002e-08,
"loss": 0.2599,
"step": 255
},
{
"epoch": 4.89171974522293,
"grad_norm": 0.29723998911569716,
"learning_rate": 4.9170302237980564e-08,
"loss": 0.2457,
"step": 256
},
{
"epoch": 4.9108280254777075,
"grad_norm": 0.3054372275805486,
"learning_rate": 4.8345296048406856e-08,
"loss": 0.2538,
"step": 257
},
{
"epoch": 4.929936305732484,
"grad_norm": 0.30363830684403537,
"learning_rate": 4.752505757920977e-08,
"loss": 0.2486,
"step": 258
},
{
"epoch": 4.949044585987261,
"grad_norm": 0.2898474470428452,
"learning_rate": 4.6709662538260266e-08,
"loss": 0.2581,
"step": 259
},
{
"epoch": 4.968152866242038,
"grad_norm": 0.2910012884854503,
"learning_rate": 4.5899186186381725e-08,
"loss": 0.2537,
"step": 260
},
{
"epoch": 4.987261146496815,
"grad_norm": 0.3149542929423315,
"learning_rate": 4.5093703330403374e-08,
"loss": 0.2535,
"step": 261
},
{
"epoch": 5.006369426751593,
"grad_norm": 0.3175825995451886,
"learning_rate": 4.429328831625565e-08,
"loss": 0.2585,
"step": 262
},
{
"epoch": 5.025477707006369,
"grad_norm": 0.3236452474738274,
"learning_rate": 4.3498015022108e-08,
"loss": 0.2653,
"step": 263
},
{
"epoch": 5.044585987261146,
"grad_norm": 0.29626391836670807,
"learning_rate": 4.270795685155001e-08,
"loss": 0.257,
"step": 264
},
{
"epoch": 5.063694267515924,
"grad_norm": 0.3026250583365394,
"learning_rate": 4.1923186726816305e-08,
"loss": 0.2559,
"step": 265
},
{
"epoch": 5.082802547770701,
"grad_norm": 0.2832298751120326,
"learning_rate": 4.114377708205571e-08,
"loss": 0.2627,
"step": 266
},
{
"epoch": 5.101910828025478,
"grad_norm": 0.29799965486813934,
"learning_rate": 4.036979985664566e-08,
"loss": 0.2494,
"step": 267
},
{
"epoch": 5.1210191082802545,
"grad_norm": 0.30594749582433756,
"learning_rate": 3.9601326488552255e-08,
"loss": 0.258,
"step": 268
},
{
"epoch": 5.140127388535032,
"grad_norm": 0.2762446500911052,
"learning_rate": 3.883842790773647e-08,
"loss": 0.2427,
"step": 269
},
{
"epoch": 5.159235668789809,
"grad_norm": 0.30019745369817896,
"learning_rate": 3.808117452960734e-08,
"loss": 0.2547,
"step": 270
},
{
"epoch": 5.178343949044586,
"grad_norm": 0.29753102050585134,
"learning_rate": 3.732963624852274e-08,
"loss": 0.2535,
"step": 271
},
{
"epoch": 5.197452229299363,
"grad_norm": 0.2935625407698436,
"learning_rate": 3.658388243133804e-08,
"loss": 0.2587,
"step": 272
},
{
"epoch": 5.2165605095541405,
"grad_norm": 0.3209655263508771,
"learning_rate": 3.584398191100341e-08,
"loss": 0.2452,
"step": 273
},
{
"epoch": 5.235668789808917,
"grad_norm": 0.31820460210115087,
"learning_rate": 3.5110002980210973e-08,
"loss": 0.2432,
"step": 274
},
{
"epoch": 5.254777070063694,
"grad_norm": 0.2992144995803717,
"learning_rate": 3.438201338509098e-08,
"loss": 0.2431,
"step": 275
},
{
"epoch": 5.273885350318471,
"grad_norm": 0.31045639532084096,
"learning_rate": 3.366008031895904e-08,
"loss": 0.2545,
"step": 276
},
{
"epoch": 5.292993630573249,
"grad_norm": 0.29354822656777496,
"learning_rate": 3.294427041611425e-08,
"loss": 0.2396,
"step": 277
},
{
"epoch": 5.312101910828026,
"grad_norm": 0.30124303336940966,
"learning_rate": 3.223464974568874e-08,
"loss": 0.2477,
"step": 278
},
{
"epoch": 5.3312101910828025,
"grad_norm": 0.30605544638967863,
"learning_rate": 3.15312838055494e-08,
"loss": 0.2591,
"step": 279
},
{
"epoch": 5.350318471337579,
"grad_norm": 0.31776982294402606,
"learning_rate": 3.083423751625281e-08,
"loss": 0.2515,
"step": 280
},
{
"epoch": 5.369426751592357,
"grad_norm": 0.30233438292813775,
"learning_rate": 3.014357521505273e-08,
"loss": 0.2609,
"step": 281
},
{
"epoch": 5.388535031847134,
"grad_norm": 0.29933329063857167,
"learning_rate": 2.9459360649961896e-08,
"loss": 0.2378,
"step": 282
},
{
"epoch": 5.407643312101911,
"grad_norm": 0.3093582978729572,
"learning_rate": 2.878165697386812e-08,
"loss": 0.2542,
"step": 283
},
{
"epoch": 5.426751592356688,
"grad_norm": 0.2992731993684574,
"learning_rate": 2.811052673870534e-08,
"loss": 0.2411,
"step": 284
},
{
"epoch": 5.445859872611465,
"grad_norm": 0.3240374627908033,
"learning_rate": 2.7446031889679888e-08,
"loss": 0.2483,
"step": 285
},
{
"epoch": 5.464968152866242,
"grad_norm": 0.3012591950583537,
"learning_rate": 2.6788233759553138e-08,
"loss": 0.2594,
"step": 286
},
{
"epoch": 5.484076433121019,
"grad_norm": 0.3232535802137296,
"learning_rate": 2.61371930629805e-08,
"loss": 0.2424,
"step": 287
},
{
"epoch": 5.503184713375796,
"grad_norm": 0.3024568075598484,
"learning_rate": 2.549296989090738e-08,
"loss": 0.2637,
"step": 288
},
{
"epoch": 5.522292993630574,
"grad_norm": 0.2862988052808537,
"learning_rate": 2.4855623705022788e-08,
"loss": 0.2656,
"step": 289
},
{
"epoch": 5.54140127388535,
"grad_norm": 0.30274335948775416,
"learning_rate": 2.4225213332271198e-08,
"loss": 0.2625,
"step": 290
},
{
"epoch": 5.560509554140127,
"grad_norm": 0.30372044645832685,
"learning_rate": 2.3601796959422582e-08,
"loss": 0.2534,
"step": 291
},
{
"epoch": 5.579617834394904,
"grad_norm": 0.2932251760911936,
"learning_rate": 2.2985432127701942e-08,
"loss": 0.2609,
"step": 292
},
{
"epoch": 5.598726114649682,
"grad_norm": 0.3003519183393173,
"learning_rate": 2.237617572747834e-08,
"loss": 0.2586,
"step": 293
},
{
"epoch": 5.617834394904459,
"grad_norm": 0.29295961875583704,
"learning_rate": 2.1774083993013716e-08,
"loss": 0.2514,
"step": 294
},
{
"epoch": 5.6369426751592355,
"grad_norm": 0.30284063679621004,
"learning_rate": 2.117921249727258e-08,
"loss": 0.2517,
"step": 295
},
{
"epoch": 5.656050955414012,
"grad_norm": 0.2944923570429743,
"learning_rate": 2.0591616146792702e-08,
"loss": 0.2571,
"step": 296
},
{
"epoch": 5.67515923566879,
"grad_norm": 0.2926540183721099,
"learning_rate": 2.001134917661713e-08,
"loss": 0.2699,
"step": 297
},
{
"epoch": 5.694267515923567,
"grad_norm": 0.28358364929517027,
"learning_rate": 1.9438465145288373e-08,
"loss": 0.2607,
"step": 298
},
{
"epoch": 5.713375796178344,
"grad_norm": 0.3058440366761672,
"learning_rate": 1.8873016929904938e-08,
"loss": 0.2545,
"step": 299
},
{
"epoch": 5.732484076433121,
"grad_norm": 0.2956033323033416,
"learning_rate": 1.831505672124083e-08,
"loss": 0.2441,
"step": 300
},
{
"epoch": 5.751592356687898,
"grad_norm": 0.3260543878256,
"learning_rate": 1.776463601892825e-08,
"loss": 0.2498,
"step": 301
},
{
"epoch": 5.770700636942675,
"grad_norm": 0.2815433176272766,
"learning_rate": 1.7221805626704277e-08,
"loss": 0.2561,
"step": 302
},
{
"epoch": 5.789808917197452,
"grad_norm": 0.32147374978023563,
"learning_rate": 1.6686615647721637e-08,
"loss": 0.2507,
"step": 303
},
{
"epoch": 5.80891719745223,
"grad_norm": 0.3136030754385911,
"learning_rate": 1.615911547992426e-08,
"loss": 0.2591,
"step": 304
},
{
"epoch": 5.828025477707007,
"grad_norm": 0.2925892696690556,
"learning_rate": 1.5639353811487744e-08,
"loss": 0.2487,
"step": 305
},
{
"epoch": 5.8471337579617835,
"grad_norm": 0.30292693895644507,
"learning_rate": 1.5127378616325602e-08,
"loss": 0.2514,
"step": 306
},
{
"epoch": 5.86624203821656,
"grad_norm": 0.31710406869483737,
"learning_rate": 1.4623237149661139e-08,
"loss": 0.2629,
"step": 307
},
{
"epoch": 5.885350318471337,
"grad_norm": 0.30143729522096213,
"learning_rate": 1.4126975943665842e-08,
"loss": 0.2388,
"step": 308
},
{
"epoch": 5.904458598726115,
"grad_norm": 0.2928856492144717,
"learning_rate": 1.3638640803164514e-08,
"loss": 0.2591,
"step": 309
},
{
"epoch": 5.923566878980892,
"grad_norm": 0.30225338228365756,
"learning_rate": 1.3158276801407431e-08,
"loss": 0.2549,
"step": 310
},
{
"epoch": 5.942675159235669,
"grad_norm": 0.31999882843186456,
"learning_rate": 1.268592827591014e-08,
"loss": 0.2552,
"step": 311
},
{
"epoch": 5.961783439490446,
"grad_norm": 0.3116709540538741,
"learning_rate": 1.2221638824361069e-08,
"loss": 0.2561,
"step": 312
},
{
"epoch": 5.980891719745223,
"grad_norm": 0.2852514751876125,
"learning_rate": 1.1765451300597573e-08,
"loss": 0.2494,
"step": 313
},
{
"epoch": 6.0,
"grad_norm": 0.30127697140383475,
"learning_rate": 1.131740781065037e-08,
"loss": 0.2677,
"step": 314
},
{
"epoch": 6.019108280254777,
"grad_norm": 0.299470849511753,
"learning_rate": 1.0877549708857225e-08,
"loss": 0.2492,
"step": 315
},
{
"epoch": 6.038216560509555,
"grad_norm": 0.29525030906520566,
"learning_rate": 1.0445917594046071e-08,
"loss": 0.2573,
"step": 316
},
{
"epoch": 6.057324840764331,
"grad_norm": 0.309895318768612,
"learning_rate": 1.0022551305787563e-08,
"loss": 0.2478,
"step": 317
},
{
"epoch": 6.076433121019108,
"grad_norm": 0.3116428679714083,
"learning_rate": 9.607489920717981e-09,
"loss": 0.2616,
"step": 318
},
{
"epoch": 6.095541401273885,
"grad_norm": 0.28485740043622854,
"learning_rate": 9.200771748932512e-09,
"loss": 0.23,
"step": 319
},
{
"epoch": 6.114649681528663,
"grad_norm": 0.30867096345949346,
"learning_rate": 8.802434330449127e-09,
"loss": 0.2423,
"step": 320
},
{
"epoch": 6.13375796178344,
"grad_norm": 0.29735720879032396,
"learning_rate": 8.412514431743656e-09,
"loss": 0.2506,
"step": 321
},
{
"epoch": 6.1528662420382165,
"grad_norm": 0.296939152603047,
"learning_rate": 8.031048042356392e-09,
"loss": 0.2518,
"step": 322
},
{
"epoch": 6.171974522292993,
"grad_norm": 0.3047288976089965,
"learning_rate": 7.65807037157007e-09,
"loss": 0.2571,
"step": 323
},
{
"epoch": 6.191082802547771,
"grad_norm": 0.30499824231597183,
"learning_rate": 7.293615845160195e-09,
"loss": 0.2492,
"step": 324
},
{
"epoch": 6.210191082802548,
"grad_norm": 0.28149771596925177,
"learning_rate": 6.9377181022174604e-09,
"loss": 0.2486,
"step": 325
},
{
"epoch": 6.229299363057325,
"grad_norm": 0.29778166813478346,
"learning_rate": 6.590409992042956e-09,
"loss": 0.253,
"step": 326
},
{
"epoch": 6.248407643312102,
"grad_norm": 0.3094630597200761,
"learning_rate": 6.25172357111603e-09,
"loss": 0.2552,
"step": 327
},
{
"epoch": 6.267515923566879,
"grad_norm": 0.3190922374282261,
"learning_rate": 5.921690100135712e-09,
"loss": 0.2585,
"step": 328
},
{
"epoch": 6.286624203821656,
"grad_norm": 0.3193448427368392,
"learning_rate": 5.600340041135132e-09,
"loss": 0.2566,
"step": 329
},
{
"epoch": 6.305732484076433,
"grad_norm": 0.30874010954068026,
"learning_rate": 5.2877030546700115e-09,
"loss": 0.2476,
"step": 330
},
{
"epoch": 6.32484076433121,
"grad_norm": 0.29530151136694954,
"learning_rate": 4.9838079970809245e-09,
"loss": 0.2526,
"step": 331
},
{
"epoch": 6.343949044585988,
"grad_norm": 0.29201059542728436,
"learning_rate": 4.688682917829967e-09,
"loss": 0.2743,
"step": 332
},
{
"epoch": 6.3630573248407645,
"grad_norm": 0.29014727881359437,
"learning_rate": 4.402355056911655e-09,
"loss": 0.2506,
"step": 333
},
{
"epoch": 6.382165605095541,
"grad_norm": 0.30021554157449915,
"learning_rate": 4.124850842338778e-09,
"loss": 0.2658,
"step": 334
},
{
"epoch": 6.401273885350318,
"grad_norm": 0.29191739918759346,
"learning_rate": 3.856195887703095e-09,
"loss": 0.2526,
"step": 335
},
{
"epoch": 6.420382165605096,
"grad_norm": 0.2970091109257102,
"learning_rate": 3.5964149898111585e-09,
"loss": 0.2515,
"step": 336
},
{
"epoch": 6.439490445859873,
"grad_norm": 0.301615837755627,
"learning_rate": 3.345532126395578e-09,
"loss": 0.2525,
"step": 337
},
{
"epoch": 6.45859872611465,
"grad_norm": 0.29483374535498386,
"learning_rate": 3.103570453901938e-09,
"loss": 0.2518,
"step": 338
},
{
"epoch": 6.477707006369426,
"grad_norm": 0.31193863122862164,
"learning_rate": 2.8705523053513814e-09,
"loss": 0.2581,
"step": 339
},
{
"epoch": 6.496815286624204,
"grad_norm": 0.3102645257081597,
"learning_rate": 2.6464991882793277e-09,
"loss": 0.2596,
"step": 340
},
{
"epoch": 6.515923566878981,
"grad_norm": 0.29720330855238497,
"learning_rate": 2.4314317827503373e-09,
"loss": 0.2482,
"step": 341
},
{
"epoch": 6.535031847133758,
"grad_norm": 0.30554779244411534,
"learning_rate": 2.2253699394493065e-09,
"loss": 0.2495,
"step": 342
},
{
"epoch": 6.554140127388535,
"grad_norm": 0.3132780643873276,
"learning_rate": 2.0283326778492536e-09,
"loss": 0.2445,
"step": 343
},
{
"epoch": 6.573248407643312,
"grad_norm": 0.2930656726881392,
"learning_rate": 1.8403381844558808e-09,
"loss": 0.2538,
"step": 344
},
{
"epoch": 6.592356687898089,
"grad_norm": 0.29674947300302884,
"learning_rate": 1.661403811128903e-09,
"loss": 0.2668,
"step": 345
},
{
"epoch": 6.611464968152866,
"grad_norm": 0.3075402793564451,
"learning_rate": 1.4915460734805096e-09,
"loss": 0.269,
"step": 346
},
{
"epoch": 6.630573248407643,
"grad_norm": 0.28966161314222383,
"learning_rate": 1.3307806493509377e-09,
"loss": 0.247,
"step": 347
},
{
"epoch": 6.649681528662421,
"grad_norm": 0.31994849636539213,
"learning_rate": 1.1791223773614634e-09,
"loss": 0.2594,
"step": 348
},
{
"epoch": 6.6687898089171975,
"grad_norm": 0.3026013256083666,
"learning_rate": 1.036585255544764e-09,
"loss": 0.2638,
"step": 349
},
{
"epoch": 6.687898089171974,
"grad_norm": 0.3132269351064342,
"learning_rate": 9.031824400528854e-10,
"loss": 0.2528,
"step": 350
},
{
"epoch": 6.707006369426751,
"grad_norm": 0.2882624273113626,
"learning_rate": 7.789262439430012e-10,
"loss": 0.2469,
"step": 351
},
{
"epoch": 6.726114649681529,
"grad_norm": 0.3064618613259807,
"learning_rate": 6.638281360408338e-10,
"loss": 0.2574,
"step": 352
},
{
"epoch": 6.745222929936306,
"grad_norm": 0.30382975584991506,
"learning_rate": 5.578987398821344e-10,
"loss": 0.2493,
"step": 353
},
{
"epoch": 6.764331210191083,
"grad_norm": 0.2902011016409033,
"learning_rate": 4.611478327321339e-10,
"loss": 0.2605,
"step": 354
},
{
"epoch": 6.7834394904458595,
"grad_norm": 0.29282370941155056,
"learning_rate": 3.735843446830866e-10,
"loss": 0.2531,
"step": 355
},
{
"epoch": 6.802547770700637,
"grad_norm": 0.3078439684683759,
"learning_rate": 2.952163578300193e-10,
"loss": 0.2473,
"step": 356
},
{
"epoch": 6.821656050955414,
"grad_norm": 0.3087832606093336,
"learning_rate": 2.2605110552477157e-10,
"loss": 0.2672,
"step": 357
},
{
"epoch": 6.840764331210191,
"grad_norm": 0.3098671142360526,
"learning_rate": 1.6609497170834154e-10,
"loss": 0.2569,
"step": 358
},
{
"epoch": 6.859872611464969,
"grad_norm": 0.29429296975358027,
"learning_rate": 1.1535349032167907e-10,
"loss": 0.2546,
"step": 359
},
{
"epoch": 6.8789808917197455,
"grad_norm": 0.31345754430512796,
"learning_rate": 7.38313447948724e-11,
"loss": 0.2639,
"step": 360
},
{
"epoch": 6.898089171974522,
"grad_norm": 0.2921055018298526,
"learning_rate": 4.153236761488266e-11,
"loss": 0.2612,
"step": 361
},
{
"epoch": 6.917197452229299,
"grad_norm": 0.28576822025063914,
"learning_rate": 1.8459539971804605e-11,
"loss": 0.2473,
"step": 362
},
{
"epoch": 6.936305732484076,
"grad_norm": 0.2826191180491914,
"learning_rate": 4.614991483686825e-12,
"loss": 0.2395,
"step": 363
},
{
"epoch": 6.955414012738854,
"grad_norm": 0.33090612760402355,
"learning_rate": 0.0,
"loss": 0.2531,
"step": 364
},
{
"epoch": 6.955414012738854,
"step": 364,
"total_flos": 3.947982283988664e+17,
"train_loss": 0.25456836733680505,
"train_runtime": 5658.0504,
"train_samples_per_second": 6.186,
"train_steps_per_second": 0.064
}
],
"logging_steps": 1,
"max_steps": 364,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.947982283988664e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}