{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9968,
"eval_steps": 500,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0128,
"grad_norm": 3.4233698136439568,
"learning_rate": 1.25e-07,
"loss": 0.0246,
"step": 1
},
{
"epoch": 0.0256,
"grad_norm": 3.728998587328461,
"learning_rate": 2.5e-07,
"loss": 0.0252,
"step": 2
},
{
"epoch": 0.0384,
"grad_norm": 3.0616613124293135,
"learning_rate": 3.75e-07,
"loss": 0.0227,
"step": 3
},
{
"epoch": 0.0512,
"grad_norm": 2.3123184172566016,
"learning_rate": 5e-07,
"loss": 0.0168,
"step": 4
},
{
"epoch": 0.064,
"grad_norm": 3.360264502123766,
"learning_rate": 6.249999999999999e-07,
"loss": 0.021,
"step": 5
},
{
"epoch": 0.0768,
"grad_norm": 3.3460855292395757,
"learning_rate": 7.5e-07,
"loss": 0.0229,
"step": 6
},
{
"epoch": 0.0896,
"grad_norm": 3.1072974219219085,
"learning_rate": 8.75e-07,
"loss": 0.0226,
"step": 7
},
{
"epoch": 0.1024,
"grad_norm": 4.56578073058385,
"learning_rate": 1e-06,
"loss": 0.0296,
"step": 8
},
{
"epoch": 0.1152,
"grad_norm": 3.4123791670336443,
"learning_rate": 1.125e-06,
"loss": 0.0234,
"step": 9
},
{
"epoch": 0.128,
"grad_norm": 2.7894681328326816,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.0203,
"step": 10
},
{
"epoch": 0.1408,
"grad_norm": 4.6455794479831685,
"learning_rate": 1.375e-06,
"loss": 0.0299,
"step": 11
},
{
"epoch": 0.1536,
"grad_norm": 3.8109983639167577,
"learning_rate": 1.5e-06,
"loss": 0.025,
"step": 12
},
{
"epoch": 0.1664,
"grad_norm": 4.183418083336812,
"learning_rate": 1.625e-06,
"loss": 0.0311,
"step": 13
},
{
"epoch": 0.1792,
"grad_norm": 4.024058901580512,
"learning_rate": 1.75e-06,
"loss": 0.024,
"step": 14
},
{
"epoch": 0.192,
"grad_norm": 2.9842133290060593,
"learning_rate": 1.8749999999999998e-06,
"loss": 0.0198,
"step": 15
},
{
"epoch": 0.2048,
"grad_norm": 4.060055578632782,
"learning_rate": 2e-06,
"loss": 0.0272,
"step": 16
},
{
"epoch": 0.2176,
"grad_norm": 4.646550393359002,
"learning_rate": 1.9997482349425066e-06,
"loss": 0.0216,
"step": 17
},
{
"epoch": 0.2304,
"grad_norm": 3.9839364783795164,
"learning_rate": 1.9989930665413145e-06,
"loss": 0.0211,
"step": 18
},
{
"epoch": 0.2432,
"grad_norm": 4.2706791731528435,
"learning_rate": 1.997734875046456e-06,
"loss": 0.0275,
"step": 19
},
{
"epoch": 0.256,
"grad_norm": 4.46746121804618,
"learning_rate": 1.995974293995239e-06,
"loss": 0.0258,
"step": 20
},
{
"epoch": 0.2688,
"grad_norm": 4.703579744776647,
"learning_rate": 1.9937122098932426e-06,
"loss": 0.0273,
"step": 21
},
{
"epoch": 0.2816,
"grad_norm": 5.162187031521371,
"learning_rate": 1.9909497617679347e-06,
"loss": 0.0297,
"step": 22
},
{
"epoch": 0.2944,
"grad_norm": 4.08147747350908,
"learning_rate": 1.9876883405951377e-06,
"loss": 0.0241,
"step": 23
},
{
"epoch": 0.3072,
"grad_norm": 5.338597185310122,
"learning_rate": 1.9839295885986295e-06,
"loss": 0.0313,
"step": 24
},
{
"epoch": 0.32,
"grad_norm": 4.004331157501513,
"learning_rate": 1.9796753984232355e-06,
"loss": 0.0233,
"step": 25
},
{
"epoch": 0.3328,
"grad_norm": 5.79846158935698,
"learning_rate": 1.9749279121818236e-06,
"loss": 0.0328,
"step": 26
},
{
"epoch": 0.3456,
"grad_norm": 6.927108105836598,
"learning_rate": 1.9696895203766866e-06,
"loss": 0.0381,
"step": 27
},
{
"epoch": 0.3584,
"grad_norm": 5.251309102448725,
"learning_rate": 1.9639628606958534e-06,
"loss": 0.0282,
"step": 28
},
{
"epoch": 0.3712,
"grad_norm": 5.050171176886125,
"learning_rate": 1.9577508166849303e-06,
"loss": 0.0246,
"step": 29
},
{
"epoch": 0.384,
"grad_norm": 5.512551765259008,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.03,
"step": 30
},
{
"epoch": 0.3968,
"grad_norm": 5.7365196994206,
"learning_rate": 1.9438833303083674e-06,
"loss": 0.0314,
"step": 31
},
{
"epoch": 0.4096,
"grad_norm": 5.220122317364698,
"learning_rate": 1.936234870639737e-06,
"loss": 0.0311,
"step": 32
},
{
"epoch": 0.4224,
"grad_norm": 5.090432583516014,
"learning_rate": 1.928114988519039e-06,
"loss": 0.0289,
"step": 33
},
{
"epoch": 0.4352,
"grad_norm": 5.145680195739282,
"learning_rate": 1.9195277725514506e-06,
"loss": 0.0272,
"step": 34
},
{
"epoch": 0.448,
"grad_norm": 5.308627403286571,
"learning_rate": 1.9104775466588157e-06,
"loss": 0.0324,
"step": 35
},
{
"epoch": 0.4608,
"grad_norm": 6.22201023848965,
"learning_rate": 1.9009688679024189e-06,
"loss": 0.0344,
"step": 36
},
{
"epoch": 0.4736,
"grad_norm": 5.003597737244695,
"learning_rate": 1.8910065241883678e-06,
"loss": 0.0333,
"step": 37
},
{
"epoch": 0.4864,
"grad_norm": 5.6592651331248,
"learning_rate": 1.8805955318567379e-06,
"loss": 0.0315,
"step": 38
},
{
"epoch": 0.4992,
"grad_norm": 5.975923701038477,
"learning_rate": 1.8697411331556953e-06,
"loss": 0.0241,
"step": 39
},
{
"epoch": 0.512,
"grad_norm": 5.750552226599778,
"learning_rate": 1.858448793601866e-06,
"loss": 0.0329,
"step": 40
},
{
"epoch": 0.5248,
"grad_norm": 5.816663494605659,
"learning_rate": 1.8467241992282841e-06,
"loss": 0.0337,
"step": 41
},
{
"epoch": 0.5376,
"grad_norm": 5.060423336355904,
"learning_rate": 1.8345732537213026e-06,
"loss": 0.0289,
"step": 42
},
{
"epoch": 0.5504,
"grad_norm": 5.111705069882343,
"learning_rate": 1.82200207544791e-06,
"loss": 0.0253,
"step": 43
},
{
"epoch": 0.5632,
"grad_norm": 4.76340608246537,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.0242,
"step": 44
},
{
"epoch": 0.576,
"grad_norm": 5.232632523840601,
"learning_rate": 1.795624548881781e-06,
"loss": 0.0332,
"step": 45
},
{
"epoch": 0.5888,
"grad_norm": 5.063499559835732,
"learning_rate": 1.7818314824680298e-06,
"loss": 0.0331,
"step": 46
},
{
"epoch": 0.6016,
"grad_norm": 5.081572984551496,
"learning_rate": 1.767644740358011e-06,
"loss": 0.0353,
"step": 47
},
{
"epoch": 0.6144,
"grad_norm": 4.256457044525209,
"learning_rate": 1.753071466003611e-06,
"loss": 0.0275,
"step": 48
},
{
"epoch": 0.6272,
"grad_norm": 5.0457530324965925,
"learning_rate": 1.7381189974873407e-06,
"loss": 0.0345,
"step": 49
},
{
"epoch": 0.64,
"grad_norm": 4.222996253822678,
"learning_rate": 1.7227948638273915e-06,
"loss": 0.0258,
"step": 50
},
{
"epoch": 0.6528,
"grad_norm": 5.105453296008258,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.0361,
"step": 51
},
{
"epoch": 0.6656,
"grad_norm": 4.7238042861158505,
"learning_rate": 1.6910626489868648e-06,
"loss": 0.03,
"step": 52
},
{
"epoch": 0.6784,
"grad_norm": 4.467100915377624,
"learning_rate": 1.6746705459320744e-06,
"loss": 0.0301,
"step": 53
},
{
"epoch": 0.6912,
"grad_norm": 4.199798840654239,
"learning_rate": 1.6579387259397126e-06,
"loss": 0.0272,
"step": 54
},
{
"epoch": 0.704,
"grad_norm": 4.253879642031892,
"learning_rate": 1.640875613985024e-06,
"loss": 0.0263,
"step": 55
},
{
"epoch": 0.7168,
"grad_norm": 5.478200127323976,
"learning_rate": 1.6234898018587336e-06,
"loss": 0.0369,
"step": 56
},
{
"epoch": 0.7296,
"grad_norm": 5.3136786533226426,
"learning_rate": 1.6057900438408199e-06,
"loss": 0.0337,
"step": 57
},
{
"epoch": 0.7424,
"grad_norm": 5.166894330495721,
"learning_rate": 1.587785252292473e-06,
"loss": 0.0355,
"step": 58
},
{
"epoch": 0.7552,
"grad_norm": 4.375886424848137,
"learning_rate": 1.569484493168452e-06,
"loss": 0.0281,
"step": 59
},
{
"epoch": 0.768,
"grad_norm": 6.886035234115627,
"learning_rate": 1.5508969814521024e-06,
"loss": 0.0388,
"step": 60
},
{
"epoch": 0.7808,
"grad_norm": 5.791644877375871,
"learning_rate": 1.5320320765153365e-06,
"loss": 0.0373,
"step": 61
},
{
"epoch": 0.7936,
"grad_norm": 5.096680042384012,
"learning_rate": 1.5128992774059062e-06,
"loss": 0.0344,
"step": 62
},
{
"epoch": 0.8064,
"grad_norm": 5.046290484669824,
"learning_rate": 1.4935082180643467e-06,
"loss": 0.0411,
"step": 63
},
{
"epoch": 0.8192,
"grad_norm": 5.210730266900293,
"learning_rate": 1.4738686624729987e-06,
"loss": 0.0353,
"step": 64
},
{
"epoch": 0.832,
"grad_norm": 4.780860746415674,
"learning_rate": 1.4539904997395467e-06,
"loss": 0.0285,
"step": 65
},
{
"epoch": 0.8448,
"grad_norm": 4.9827723432345765,
"learning_rate": 1.433883739117558e-06,
"loss": 0.0355,
"step": 66
},
{
"epoch": 0.8576,
"grad_norm": 4.566523361618775,
"learning_rate": 1.4135585049665206e-06,
"loss": 0.0229,
"step": 67
},
{
"epoch": 0.8704,
"grad_norm": 3.9442441573671534,
"learning_rate": 1.3930250316539235e-06,
"loss": 0.0251,
"step": 68
},
{
"epoch": 0.8832,
"grad_norm": 4.996881415714366,
"learning_rate": 1.3722936584019451e-06,
"loss": 0.0361,
"step": 69
},
{
"epoch": 0.896,
"grad_norm": 5.769732263820899,
"learning_rate": 1.3513748240813427e-06,
"loss": 0.0366,
"step": 70
},
{
"epoch": 0.9088,
"grad_norm": 4.573503361877803,
"learning_rate": 1.3302790619551672e-06,
"loss": 0.0272,
"step": 71
},
{
"epoch": 0.9216,
"grad_norm": 3.296575123346509,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.023,
"step": 72
},
{
"epoch": 0.9344,
"grad_norm": 4.908588421936803,
"learning_rate": 1.2875993274320173e-06,
"loss": 0.0278,
"step": 73
},
{
"epoch": 0.9472,
"grad_norm": 3.846287378603072,
"learning_rate": 1.266036845566675e-06,
"loss": 0.0255,
"step": 74
},
{
"epoch": 0.96,
"grad_norm": 4.087209232767521,
"learning_rate": 1.244340406137894e-06,
"loss": 0.0295,
"step": 75
},
{
"epoch": 0.9728,
"grad_norm": 4.64654246748357,
"learning_rate": 1.2225209339563143e-06,
"loss": 0.0278,
"step": 76
},
{
"epoch": 0.9856,
"grad_norm": 4.507522376329423,
"learning_rate": 1.2005894157832728e-06,
"loss": 0.0319,
"step": 77
},
{
"epoch": 0.9984,
"grad_norm": 5.762678744609664,
"learning_rate": 1.1785568947986366e-06,
"loss": 0.0352,
"step": 78
},
{
"epoch": 1.0112,
"grad_norm": 3.1644394788507513,
"learning_rate": 1.156434465040231e-06,
"loss": 0.0161,
"step": 79
},
{
"epoch": 1.024,
"grad_norm": 3.7447721631178,
"learning_rate": 1.1342332658176555e-06,
"loss": 0.0191,
"step": 80
},
{
"epoch": 1.0368,
"grad_norm": 3.2322255457531637,
"learning_rate": 1.1119644761033077e-06,
"loss": 0.0124,
"step": 81
},
{
"epoch": 1.0496,
"grad_norm": 2.4196830627748285,
"learning_rate": 1.0896393089034335e-06,
"loss": 0.0105,
"step": 82
},
{
"epoch": 1.0624,
"grad_norm": 2.5920868298208872,
"learning_rate": 1.0672690056120398e-06,
"loss": 0.0144,
"step": 83
},
{
"epoch": 1.0752,
"grad_norm": 2.5821101862637175,
"learning_rate": 1.044864830350515e-06,
"loss": 0.0139,
"step": 84
},
{
"epoch": 1.088,
"grad_norm": 3.02231827873139,
"learning_rate": 1.022438064295805e-06,
"loss": 0.0135,
"step": 85
},
{
"epoch": 1.1008,
"grad_norm": 2.2923086363844845,
"learning_rate": 1e-06,
"loss": 0.0117,
"step": 86
},
{
"epoch": 1.1136,
"grad_norm": 2.8842975225041623,
"learning_rate": 9.77561935704195e-07,
"loss": 0.0115,
"step": 87
},
{
"epoch": 1.1264,
"grad_norm": 5.879985429015499,
"learning_rate": 9.551351696494853e-07,
"loss": 0.0119,
"step": 88
},
{
"epoch": 1.1392,
"grad_norm": 2.360056985536234,
"learning_rate": 9.327309943879603e-07,
"loss": 0.0096,
"step": 89
},
{
"epoch": 1.152,
"grad_norm": 1.9869435676758012,
"learning_rate": 9.103606910965665e-07,
"loss": 0.0079,
"step": 90
},
{
"epoch": 1.1648,
"grad_norm": 2.5601097067547856,
"learning_rate": 8.880355238966921e-07,
"loss": 0.0104,
"step": 91
},
{
"epoch": 1.1776,
"grad_norm": 2.7730596832345564,
"learning_rate": 8.657667341823448e-07,
"loss": 0.0118,
"step": 92
},
{
"epoch": 1.1904,
"grad_norm": 2.18163461710527,
"learning_rate": 8.435655349597689e-07,
"loss": 0.0105,
"step": 93
},
{
"epoch": 1.2032,
"grad_norm": 2.251464092159168,
"learning_rate": 8.214431052013634e-07,
"loss": 0.0148,
"step": 94
},
{
"epoch": 1.216,
"grad_norm": 2.7530295169182333,
"learning_rate": 7.994105842167272e-07,
"loss": 0.01,
"step": 95
},
{
"epoch": 1.2288000000000001,
"grad_norm": 2.526225960487079,
"learning_rate": 7.774790660436857e-07,
"loss": 0.0089,
"step": 96
},
{
"epoch": 1.2416,
"grad_norm": 2.9558648711495414,
"learning_rate": 7.556595938621058e-07,
"loss": 0.0121,
"step": 97
},
{
"epoch": 1.2544,
"grad_norm": 4.047664680116945,
"learning_rate": 7.33963154433325e-07,
"loss": 0.0122,
"step": 98
},
{
"epoch": 1.2671999999999999,
"grad_norm": 4.128501309390267,
"learning_rate": 7.124006725679828e-07,
"loss": 0.0132,
"step": 99
},
{
"epoch": 1.28,
"grad_norm": 3.158070174858742,
"learning_rate": 6.909830056250526e-07,
"loss": 0.0105,
"step": 100
},
{
"epoch": 1.2928,
"grad_norm": 2.330265912872272,
"learning_rate": 6.697209380448332e-07,
"loss": 0.0101,
"step": 101
},
{
"epoch": 1.3056,
"grad_norm": 4.9757360072405445,
"learning_rate": 6.486251759186572e-07,
"loss": 0.0179,
"step": 102
},
{
"epoch": 1.3184,
"grad_norm": 3.8743181605977743,
"learning_rate": 6.277063415980548e-07,
"loss": 0.0129,
"step": 103
},
{
"epoch": 1.3312,
"grad_norm": 3.263723675313523,
"learning_rate": 6.069749683460764e-07,
"loss": 0.0111,
"step": 104
},
{
"epoch": 1.3439999999999999,
"grad_norm": 3.1218175870205584,
"learning_rate": 5.864414950334795e-07,
"loss": 0.0119,
"step": 105
},
{
"epoch": 1.3568,
"grad_norm": 3.717962785205817,
"learning_rate": 5.661162608824419e-07,
"loss": 0.0115,
"step": 106
},
{
"epoch": 1.3696,
"grad_norm": 3.650556269715187,
"learning_rate": 5.460095002604532e-07,
"loss": 0.0123,
"step": 107
},
{
"epoch": 1.3824,
"grad_norm": 3.2197493950580296,
"learning_rate": 5.261313375270013e-07,
"loss": 0.0137,
"step": 108
},
{
"epoch": 1.3952,
"grad_norm": 3.365111064634147,
"learning_rate": 5.064917819356531e-07,
"loss": 0.0111,
"step": 109
},
{
"epoch": 1.408,
"grad_norm": 4.710390460257863,
"learning_rate": 4.871007225940939e-07,
"loss": 0.0129,
"step": 110
},
{
"epoch": 1.4208,
"grad_norm": 2.76927802183368,
"learning_rate": 4.6796792348466353e-07,
"loss": 0.013,
"step": 111
},
{
"epoch": 1.4336,
"grad_norm": 3.2171761582689915,
"learning_rate": 4.4910301854789755e-07,
"loss": 0.0114,
"step": 112
},
{
"epoch": 1.4464000000000001,
"grad_norm": 2.7947744875678096,
"learning_rate": 4.3051550683154804e-07,
"loss": 0.0113,
"step": 113
},
{
"epoch": 1.4592,
"grad_norm": 2.4585140708787043,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.0103,
"step": 114
},
{
"epoch": 1.472,
"grad_norm": 2.8113536653109965,
"learning_rate": 3.942099561591802e-07,
"loss": 0.0106,
"step": 115
},
{
"epoch": 1.4848,
"grad_norm": 3.6398945452240055,
"learning_rate": 3.765101981412665e-07,
"loss": 0.0127,
"step": 116
},
{
"epoch": 1.4976,
"grad_norm": 2.9485443643029607,
"learning_rate": 3.5912438601497584e-07,
"loss": 0.009,
"step": 117
},
{
"epoch": 1.5104,
"grad_norm": 2.9984190637681096,
"learning_rate": 3.420612740602874e-07,
"loss": 0.0093,
"step": 118
},
{
"epoch": 1.5232,
"grad_norm": 2.8046736646132744,
"learning_rate": 3.253294540679257e-07,
"loss": 0.0094,
"step": 119
},
{
"epoch": 1.536,
"grad_norm": 2.9182942187963605,
"learning_rate": 3.0893735101313535e-07,
"loss": 0.0101,
"step": 120
},
{
"epoch": 1.5488,
"grad_norm": 4.061738080588852,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.0148,
"step": 121
},
{
"epoch": 1.5615999999999999,
"grad_norm": 2.6181262527250064,
"learning_rate": 2.7720513617260855e-07,
"loss": 0.0108,
"step": 122
},
{
"epoch": 1.5744,
"grad_norm": 2.814162128761838,
"learning_rate": 2.6188100251265943e-07,
"loss": 0.0085,
"step": 123
},
{
"epoch": 1.5872000000000002,
"grad_norm": 2.978469400791401,
"learning_rate": 2.4692853399638913e-07,
"loss": 0.0123,
"step": 124
},
{
"epoch": 1.6,
"grad_norm": 1.9752826330171183,
"learning_rate": 2.3235525964198888e-07,
"loss": 0.0091,
"step": 125
},
{
"epoch": 1.6128,
"grad_norm": 2.6883643677418623,
"learning_rate": 2.181685175319702e-07,
"loss": 0.0097,
"step": 126
},
{
"epoch": 1.6256,
"grad_norm": 2.9489506229688884,
"learning_rate": 2.043754511182191e-07,
"loss": 0.0079,
"step": 127
},
{
"epoch": 1.6383999999999999,
"grad_norm": 1.9593044592383062,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.0081,
"step": 128
},
{
"epoch": 1.6512,
"grad_norm": 1.952957689869861,
"learning_rate": 1.7799792455209016e-07,
"loss": 0.0082,
"step": 129
},
{
"epoch": 1.6640000000000001,
"grad_norm": 2.38161666441156,
"learning_rate": 1.6542674627869734e-07,
"loss": 0.0094,
"step": 130
},
{
"epoch": 1.6768,
"grad_norm": 2.164377402243927,
"learning_rate": 1.5327580077171588e-07,
"loss": 0.0097,
"step": 131
},
{
"epoch": 1.6896,
"grad_norm": 3.518297727779028,
"learning_rate": 1.415512063981339e-07,
"loss": 0.0089,
"step": 132
},
{
"epoch": 1.7024,
"grad_norm": 3.697074996731219,
"learning_rate": 1.3025886684430465e-07,
"loss": 0.0078,
"step": 133
},
{
"epoch": 1.7151999999999998,
"grad_norm": 4.314207988612093,
"learning_rate": 1.19404468143262e-07,
"loss": 0.0119,
"step": 134
},
{
"epoch": 1.728,
"grad_norm": 2.9026476752647414,
"learning_rate": 1.089934758116322e-07,
"loss": 0.0138,
"step": 135
},
{
"epoch": 1.7408000000000001,
"grad_norm": 2.502384977722473,
"learning_rate": 9.903113209758096e-08,
"loss": 0.0112,
"step": 136
},
{
"epoch": 1.7536,
"grad_norm": 1.7807221172577514,
"learning_rate": 8.952245334118413e-08,
"loss": 0.0077,
"step": 137
},
{
"epoch": 1.7664,
"grad_norm": 2.727436513377534,
"learning_rate": 8.047222744854942e-08,
"loss": 0.0096,
"step": 138
},
{
"epoch": 1.7792,
"grad_norm": 1.6157555666433816,
"learning_rate": 7.188501148096116e-08,
"loss": 0.007,
"step": 139
},
{
"epoch": 1.792,
"grad_norm": 1.5949515973033697,
"learning_rate": 6.376512936026279e-08,
"loss": 0.0062,
"step": 140
},
{
"epoch": 1.8048,
"grad_norm": 2.6399637194756718,
"learning_rate": 5.611666969163242e-08,
"loss": 0.0095,
"step": 141
},
{
"epoch": 1.8176,
"grad_norm": 3.0621545226623907,
"learning_rate": 4.8943483704846465e-08,
"loss": 0.0132,
"step": 142
},
{
"epoch": 1.8304,
"grad_norm": 1.9600404784878551,
"learning_rate": 4.224918331506955e-08,
"loss": 0.0095,
"step": 143
},
{
"epoch": 1.8432,
"grad_norm": 1.963746557604649,
"learning_rate": 3.6037139304146756e-08,
"loss": 0.0099,
"step": 144
},
{
"epoch": 1.8559999999999999,
"grad_norm": 4.527295989762448,
"learning_rate": 3.0310479623313125e-08,
"loss": 0.0144,
"step": 145
},
{
"epoch": 1.8688,
"grad_norm": 2.6834067438548166,
"learning_rate": 2.507208781817638e-08,
"loss": 0.012,
"step": 146
},
{
"epoch": 1.8816000000000002,
"grad_norm": 1.8417342752412211,
"learning_rate": 2.032460157676452e-08,
"loss": 0.0077,
"step": 147
},
{
"epoch": 1.8944,
"grad_norm": 2.99373797619176,
"learning_rate": 1.607041140137033e-08,
"loss": 0.0115,
"step": 148
},
{
"epoch": 1.9072,
"grad_norm": 1.1214911188004222,
"learning_rate": 1.231165940486234e-08,
"loss": 0.0062,
"step": 149
},
{
"epoch": 1.92,
"grad_norm": 2.116183152950272,
"learning_rate": 9.050238232065299e-09,
"loss": 0.0097,
"step": 150
},
{
"epoch": 1.9327999999999999,
"grad_norm": 1.8359832453089462,
"learning_rate": 6.2877901067573955e-09,
"loss": 0.0069,
"step": 151
},
{
"epoch": 1.9456,
"grad_norm": 2.0445129237685773,
"learning_rate": 4.025706004760931e-09,
"loss": 0.0086,
"step": 152
},
{
"epoch": 1.9584000000000001,
"grad_norm": 1.4095794512624829,
"learning_rate": 2.2651249535439177e-09,
"loss": 0.0053,
"step": 153
},
{
"epoch": 1.9712,
"grad_norm": 1.4980650303315703,
"learning_rate": 1.0069334586854105e-09,
"loss": 0.0068,
"step": 154
},
{
"epoch": 1.984,
"grad_norm": 1.6817715164958336,
"learning_rate": 2.517650574934693e-10,
"loss": 0.0087,
"step": 155
},
{
"epoch": 1.9968,
"grad_norm": 5.327422033689792,
"learning_rate": 0.0,
"loss": 0.0163,
"step": 156
},
{
"epoch": 1.9968,
"step": 156,
"total_flos": 138561371045888.0,
"train_loss": 0.019949143110678937,
"train_runtime": 6111.638,
"train_samples_per_second": 6.545,
"train_steps_per_second": 0.026
}
],
"logging_steps": 1,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 138561371045888.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}