{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21416142417347075,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010708071208673537,
"grad_norm": 3.1792166233062744,
"learning_rate": 5e-05,
"loss": 2.9696,
"step": 1
},
{
"epoch": 0.0021416142417347074,
"grad_norm": 3.353651285171509,
"learning_rate": 0.0001,
"loss": 3.0758,
"step": 2
},
{
"epoch": 0.0032124213626020613,
"grad_norm": 1.886090874671936,
"learning_rate": 0.00015,
"loss": 2.7586,
"step": 3
},
{
"epoch": 0.004283228483469415,
"grad_norm": 1.451682686805725,
"learning_rate": 0.0002,
"loss": 2.5324,
"step": 4
},
{
"epoch": 0.005354035604336769,
"grad_norm": 1.175742268562317,
"learning_rate": 0.00025,
"loss": 2.2141,
"step": 5
},
{
"epoch": 0.0064248427252041225,
"grad_norm": 0.868193507194519,
"learning_rate": 0.0003,
"loss": 2.0835,
"step": 6
},
{
"epoch": 0.007495649846071476,
"grad_norm": 1.0772305727005005,
"learning_rate": 0.00035,
"loss": 1.9921,
"step": 7
},
{
"epoch": 0.00856645696693883,
"grad_norm": 1.069272518157959,
"learning_rate": 0.0004,
"loss": 1.9016,
"step": 8
},
{
"epoch": 0.009637264087806184,
"grad_norm": 0.7301461100578308,
"learning_rate": 0.00045000000000000004,
"loss": 1.8262,
"step": 9
},
{
"epoch": 0.010708071208673538,
"grad_norm": 0.49968260526657104,
"learning_rate": 0.0005,
"loss": 1.6998,
"step": 10
},
{
"epoch": 0.01177887832954089,
"grad_norm": 0.42115330696105957,
"learning_rate": 0.0004994582881906825,
"loss": 1.6768,
"step": 11
},
{
"epoch": 0.012849685450408245,
"grad_norm": 0.5151969790458679,
"learning_rate": 0.0004989165763813651,
"loss": 1.7301,
"step": 12
},
{
"epoch": 0.0139204925712756,
"grad_norm": 0.604058563709259,
"learning_rate": 0.0004983748645720476,
"loss": 1.6961,
"step": 13
},
{
"epoch": 0.014991299692142952,
"grad_norm": 0.4526136815547943,
"learning_rate": 0.0004978331527627302,
"loss": 1.6385,
"step": 14
},
{
"epoch": 0.016062106813010308,
"grad_norm": 0.3732638657093048,
"learning_rate": 0.0004972914409534127,
"loss": 1.6927,
"step": 15
},
{
"epoch": 0.01713291393387766,
"grad_norm": 0.39037632942199707,
"learning_rate": 0.0004967497291440954,
"loss": 1.6063,
"step": 16
},
{
"epoch": 0.018203721054745013,
"grad_norm": 0.41541412472724915,
"learning_rate": 0.0004962080173347779,
"loss": 1.6535,
"step": 17
},
{
"epoch": 0.019274528175612368,
"grad_norm": 0.33182990550994873,
"learning_rate": 0.0004956663055254605,
"loss": 1.5564,
"step": 18
},
{
"epoch": 0.020345335296479722,
"grad_norm": 0.3516808748245239,
"learning_rate": 0.0004951245937161431,
"loss": 1.6012,
"step": 19
},
{
"epoch": 0.021416142417347076,
"grad_norm": 0.3928525447845459,
"learning_rate": 0.0004945828819068256,
"loss": 1.6524,
"step": 20
},
{
"epoch": 0.02248694953821443,
"grad_norm": 0.3181082308292389,
"learning_rate": 0.0004940411700975082,
"loss": 1.6055,
"step": 21
},
{
"epoch": 0.02355775665908178,
"grad_norm": 0.30989620089530945,
"learning_rate": 0.0004934994582881907,
"loss": 1.6236,
"step": 22
},
{
"epoch": 0.024628563779949136,
"grad_norm": 0.3335777521133423,
"learning_rate": 0.0004929577464788732,
"loss": 1.6403,
"step": 23
},
{
"epoch": 0.02569937090081649,
"grad_norm": 0.36894136667251587,
"learning_rate": 0.0004924160346695558,
"loss": 1.6778,
"step": 24
},
{
"epoch": 0.026770178021683844,
"grad_norm": 0.3191300928592682,
"learning_rate": 0.0004918743228602383,
"loss": 1.5897,
"step": 25
},
{
"epoch": 0.0278409851425512,
"grad_norm": 0.3290117681026459,
"learning_rate": 0.0004913326110509209,
"loss": 1.6285,
"step": 26
},
{
"epoch": 0.028911792263418553,
"grad_norm": 0.307182252407074,
"learning_rate": 0.0004907908992416034,
"loss": 1.5576,
"step": 27
},
{
"epoch": 0.029982599384285904,
"grad_norm": 0.28709110617637634,
"learning_rate": 0.0004902491874322861,
"loss": 1.6744,
"step": 28
},
{
"epoch": 0.031053406505153258,
"grad_norm": 0.33125609159469604,
"learning_rate": 0.0004897074756229686,
"loss": 1.6106,
"step": 29
},
{
"epoch": 0.032124213626020616,
"grad_norm": 0.31909990310668945,
"learning_rate": 0.0004891657638136512,
"loss": 1.5368,
"step": 30
},
{
"epoch": 0.03319502074688797,
"grad_norm": 0.34221193194389343,
"learning_rate": 0.0004886240520043337,
"loss": 1.6336,
"step": 31
},
{
"epoch": 0.03426582786775532,
"grad_norm": 0.34219980239868164,
"learning_rate": 0.00048808234019501623,
"loss": 1.6243,
"step": 32
},
{
"epoch": 0.035336634988622675,
"grad_norm": 0.29287898540496826,
"learning_rate": 0.0004875406283856988,
"loss": 1.5441,
"step": 33
},
{
"epoch": 0.036407442109490026,
"grad_norm": 0.29403921961784363,
"learning_rate": 0.0004869989165763814,
"loss": 1.651,
"step": 34
},
{
"epoch": 0.037478249230357384,
"grad_norm": 0.3238803446292877,
"learning_rate": 0.00048645720476706396,
"loss": 1.6178,
"step": 35
},
{
"epoch": 0.038549056351224735,
"grad_norm": 0.3332749903202057,
"learning_rate": 0.0004859154929577465,
"loss": 1.5395,
"step": 36
},
{
"epoch": 0.03961986347209209,
"grad_norm": 0.33042415976524353,
"learning_rate": 0.0004853737811484291,
"loss": 1.5116,
"step": 37
},
{
"epoch": 0.040690670592959444,
"grad_norm": 0.32300877571105957,
"learning_rate": 0.00048483206933911164,
"loss": 1.5697,
"step": 38
},
{
"epoch": 0.041761477713826795,
"grad_norm": 0.35760653018951416,
"learning_rate": 0.00048429035752979414,
"loss": 1.629,
"step": 39
},
{
"epoch": 0.04283228483469415,
"grad_norm": 0.3095184564590454,
"learning_rate": 0.0004837486457204767,
"loss": 1.571,
"step": 40
},
{
"epoch": 0.0439030919555615,
"grad_norm": 0.30683574080467224,
"learning_rate": 0.00048320693391115926,
"loss": 1.5357,
"step": 41
},
{
"epoch": 0.04497389907642886,
"grad_norm": 0.33406275510787964,
"learning_rate": 0.0004826652221018418,
"loss": 1.6077,
"step": 42
},
{
"epoch": 0.04604470619729621,
"grad_norm": 0.42627573013305664,
"learning_rate": 0.0004821235102925244,
"loss": 1.5662,
"step": 43
},
{
"epoch": 0.04711551331816356,
"grad_norm": 0.3232003152370453,
"learning_rate": 0.00048158179848320693,
"loss": 1.6063,
"step": 44
},
{
"epoch": 0.04818632043903092,
"grad_norm": 0.4828573763370514,
"learning_rate": 0.0004810400866738895,
"loss": 1.523,
"step": 45
},
{
"epoch": 0.04925712755989827,
"grad_norm": 0.39869874715805054,
"learning_rate": 0.00048049837486457205,
"loss": 1.5844,
"step": 46
},
{
"epoch": 0.05032793468076563,
"grad_norm": 0.36061400175094604,
"learning_rate": 0.0004799566630552546,
"loss": 1.589,
"step": 47
},
{
"epoch": 0.05139874180163298,
"grad_norm": 0.3593485951423645,
"learning_rate": 0.00047941495124593716,
"loss": 1.5149,
"step": 48
},
{
"epoch": 0.05246954892250034,
"grad_norm": 0.3493165373802185,
"learning_rate": 0.0004788732394366197,
"loss": 1.586,
"step": 49
},
{
"epoch": 0.05354035604336769,
"grad_norm": 0.3129478394985199,
"learning_rate": 0.00047833152762730233,
"loss": 1.5374,
"step": 50
},
{
"epoch": 0.05461116316423504,
"grad_norm": 0.3232264816761017,
"learning_rate": 0.00047778981581798484,
"loss": 1.5473,
"step": 51
},
{
"epoch": 0.0556819702851024,
"grad_norm": 0.3314213752746582,
"learning_rate": 0.0004772481040086674,
"loss": 1.5624,
"step": 52
},
{
"epoch": 0.05675277740596975,
"grad_norm": 0.3443197011947632,
"learning_rate": 0.00047670639219934995,
"loss": 1.523,
"step": 53
},
{
"epoch": 0.057823584526837106,
"grad_norm": 0.3222476840019226,
"learning_rate": 0.0004761646803900325,
"loss": 1.6094,
"step": 54
},
{
"epoch": 0.05889439164770446,
"grad_norm": 0.30979102849960327,
"learning_rate": 0.00047562296858071507,
"loss": 1.6053,
"step": 55
},
{
"epoch": 0.05996519876857181,
"grad_norm": 0.3003416061401367,
"learning_rate": 0.00047508125677139763,
"loss": 1.4889,
"step": 56
},
{
"epoch": 0.061036005889439165,
"grad_norm": 0.3053031861782074,
"learning_rate": 0.0004745395449620802,
"loss": 1.5641,
"step": 57
},
{
"epoch": 0.062106813010306516,
"grad_norm": 0.31200629472732544,
"learning_rate": 0.00047399783315276275,
"loss": 1.5857,
"step": 58
},
{
"epoch": 0.06317762013117387,
"grad_norm": 0.3085310757160187,
"learning_rate": 0.0004734561213434453,
"loss": 1.5795,
"step": 59
},
{
"epoch": 0.06424842725204123,
"grad_norm": 0.3053343892097473,
"learning_rate": 0.00047291440953412786,
"loss": 1.48,
"step": 60
},
{
"epoch": 0.06531923437290858,
"grad_norm": 0.31742650270462036,
"learning_rate": 0.0004723726977248104,
"loss": 1.5267,
"step": 61
},
{
"epoch": 0.06639004149377593,
"grad_norm": 0.302557110786438,
"learning_rate": 0.0004718309859154929,
"loss": 1.4835,
"step": 62
},
{
"epoch": 0.06746084861464328,
"grad_norm": 0.3269102871417999,
"learning_rate": 0.0004712892741061755,
"loss": 1.6023,
"step": 63
},
{
"epoch": 0.06853165573551064,
"grad_norm": 0.3242720365524292,
"learning_rate": 0.00047074756229685804,
"loss": 1.6019,
"step": 64
},
{
"epoch": 0.069602462856378,
"grad_norm": 0.3117155134677887,
"learning_rate": 0.00047020585048754065,
"loss": 1.5719,
"step": 65
},
{
"epoch": 0.07067326997724535,
"grad_norm": 0.31575411558151245,
"learning_rate": 0.0004696641386782232,
"loss": 1.5588,
"step": 66
},
{
"epoch": 0.0717440770981127,
"grad_norm": 0.3055570125579834,
"learning_rate": 0.00046912242686890577,
"loss": 1.54,
"step": 67
},
{
"epoch": 0.07281488421898005,
"grad_norm": 0.30278709530830383,
"learning_rate": 0.0004685807150595883,
"loss": 1.4943,
"step": 68
},
{
"epoch": 0.0738856913398474,
"grad_norm": 0.31028270721435547,
"learning_rate": 0.0004680390032502709,
"loss": 1.4901,
"step": 69
},
{
"epoch": 0.07495649846071477,
"grad_norm": 0.3005111515522003,
"learning_rate": 0.00046749729144095344,
"loss": 1.4811,
"step": 70
},
{
"epoch": 0.07602730558158212,
"grad_norm": 0.31970301270484924,
"learning_rate": 0.000466955579631636,
"loss": 1.5812,
"step": 71
},
{
"epoch": 0.07709811270244947,
"grad_norm": 0.31910890340805054,
"learning_rate": 0.00046641386782231856,
"loss": 1.5398,
"step": 72
},
{
"epoch": 0.07816891982331682,
"grad_norm": 0.34352612495422363,
"learning_rate": 0.0004658721560130011,
"loss": 1.6016,
"step": 73
},
{
"epoch": 0.07923972694418419,
"grad_norm": 0.3307402729988098,
"learning_rate": 0.0004653304442036836,
"loss": 1.5357,
"step": 74
},
{
"epoch": 0.08031053406505154,
"grad_norm": 0.31802475452423096,
"learning_rate": 0.0004647887323943662,
"loss": 1.5463,
"step": 75
},
{
"epoch": 0.08138134118591889,
"grad_norm": 0.3045582175254822,
"learning_rate": 0.00046424702058504874,
"loss": 1.4936,
"step": 76
},
{
"epoch": 0.08245214830678624,
"grad_norm": 0.3408415913581848,
"learning_rate": 0.0004637053087757313,
"loss": 1.526,
"step": 77
},
{
"epoch": 0.08352295542765359,
"grad_norm": 0.3176616430282593,
"learning_rate": 0.00046316359696641385,
"loss": 1.5581,
"step": 78
},
{
"epoch": 0.08459376254852095,
"grad_norm": 0.3179102838039398,
"learning_rate": 0.0004626218851570964,
"loss": 1.5525,
"step": 79
},
{
"epoch": 0.0856645696693883,
"grad_norm": 0.3425735831260681,
"learning_rate": 0.00046208017334777897,
"loss": 1.4914,
"step": 80
},
{
"epoch": 0.08673537679025566,
"grad_norm": 0.36185234785079956,
"learning_rate": 0.0004615384615384616,
"loss": 1.5293,
"step": 81
},
{
"epoch": 0.087806183911123,
"grad_norm": 0.3470607399940491,
"learning_rate": 0.00046099674972914414,
"loss": 1.5388,
"step": 82
},
{
"epoch": 0.08887699103199036,
"grad_norm": 0.3171769976615906,
"learning_rate": 0.0004604550379198267,
"loss": 1.4932,
"step": 83
},
{
"epoch": 0.08994779815285772,
"grad_norm": 0.3396613895893097,
"learning_rate": 0.00045991332611050926,
"loss": 1.5367,
"step": 84
},
{
"epoch": 0.09101860527372507,
"grad_norm": 0.3147753179073334,
"learning_rate": 0.0004593716143011918,
"loss": 1.5413,
"step": 85
},
{
"epoch": 0.09208941239459242,
"grad_norm": 0.3213801383972168,
"learning_rate": 0.0004588299024918743,
"loss": 1.4544,
"step": 86
},
{
"epoch": 0.09316021951545977,
"grad_norm": 0.3900924623012543,
"learning_rate": 0.0004582881906825569,
"loss": 1.5155,
"step": 87
},
{
"epoch": 0.09423102663632713,
"grad_norm": 0.34930315613746643,
"learning_rate": 0.00045774647887323943,
"loss": 1.5323,
"step": 88
},
{
"epoch": 0.09530183375719449,
"grad_norm": 0.32511013746261597,
"learning_rate": 0.000457204767063922,
"loss": 1.484,
"step": 89
},
{
"epoch": 0.09637264087806184,
"grad_norm": 0.3209106922149658,
"learning_rate": 0.00045666305525460455,
"loss": 1.4659,
"step": 90
},
{
"epoch": 0.09744344799892919,
"grad_norm": 0.3438887298107147,
"learning_rate": 0.0004561213434452871,
"loss": 1.522,
"step": 91
},
{
"epoch": 0.09851425511979654,
"grad_norm": 0.5644230842590332,
"learning_rate": 0.00045557963163596967,
"loss": 1.5703,
"step": 92
},
{
"epoch": 0.0995850622406639,
"grad_norm": 0.35866114497184753,
"learning_rate": 0.0004550379198266522,
"loss": 1.5637,
"step": 93
},
{
"epoch": 0.10065586936153126,
"grad_norm": 0.3141271770000458,
"learning_rate": 0.0004544962080173348,
"loss": 1.5275,
"step": 94
},
{
"epoch": 0.10172667648239861,
"grad_norm": 0.3229062557220459,
"learning_rate": 0.00045395449620801734,
"loss": 1.509,
"step": 95
},
{
"epoch": 0.10279748360326596,
"grad_norm": 0.3184738755226135,
"learning_rate": 0.0004534127843986999,
"loss": 1.5243,
"step": 96
},
{
"epoch": 0.10386829072413331,
"grad_norm": 0.33315855264663696,
"learning_rate": 0.00045287107258938246,
"loss": 1.4969,
"step": 97
},
{
"epoch": 0.10493909784500068,
"grad_norm": 0.37624651193618774,
"learning_rate": 0.000452329360780065,
"loss": 1.5713,
"step": 98
},
{
"epoch": 0.10600990496586803,
"grad_norm": 0.3466942608356476,
"learning_rate": 0.0004517876489707476,
"loss": 1.4497,
"step": 99
},
{
"epoch": 0.10708071208673538,
"grad_norm": 0.3428940773010254,
"learning_rate": 0.00045124593716143013,
"loss": 1.5272,
"step": 100
},
{
"epoch": 0.10815151920760273,
"grad_norm": 0.32997605204582214,
"learning_rate": 0.0004507042253521127,
"loss": 1.5664,
"step": 101
},
{
"epoch": 0.10922232632847008,
"grad_norm": 0.35048359632492065,
"learning_rate": 0.00045016251354279525,
"loss": 1.4883,
"step": 102
},
{
"epoch": 0.11029313344933744,
"grad_norm": 0.3379492461681366,
"learning_rate": 0.0004496208017334778,
"loss": 1.4706,
"step": 103
},
{
"epoch": 0.1113639405702048,
"grad_norm": 0.36966028809547424,
"learning_rate": 0.00044907908992416036,
"loss": 1.5116,
"step": 104
},
{
"epoch": 0.11243474769107215,
"grad_norm": 0.3487953245639801,
"learning_rate": 0.0004485373781148429,
"loss": 1.5147,
"step": 105
},
{
"epoch": 0.1135055548119395,
"grad_norm": 0.3422049582004547,
"learning_rate": 0.0004479956663055255,
"loss": 1.4782,
"step": 106
},
{
"epoch": 0.11457636193280685,
"grad_norm": 0.3196428716182709,
"learning_rate": 0.00044745395449620804,
"loss": 1.4375,
"step": 107
},
{
"epoch": 0.11564716905367421,
"grad_norm": 0.3369114398956299,
"learning_rate": 0.00044691224268689054,
"loss": 1.5261,
"step": 108
},
{
"epoch": 0.11671797617454156,
"grad_norm": 0.35993748903274536,
"learning_rate": 0.0004463705308775731,
"loss": 1.5136,
"step": 109
},
{
"epoch": 0.11778878329540891,
"grad_norm": 0.3427882790565491,
"learning_rate": 0.00044582881906825566,
"loss": 1.5352,
"step": 110
},
{
"epoch": 0.11885959041627626,
"grad_norm": 0.3308979570865631,
"learning_rate": 0.0004452871072589382,
"loss": 1.4979,
"step": 111
},
{
"epoch": 0.11993039753714362,
"grad_norm": 0.3407396376132965,
"learning_rate": 0.00044474539544962083,
"loss": 1.5055,
"step": 112
},
{
"epoch": 0.12100120465801098,
"grad_norm": 0.34919309616088867,
"learning_rate": 0.0004442036836403034,
"loss": 1.5032,
"step": 113
},
{
"epoch": 0.12207201177887833,
"grad_norm": 0.34088361263275146,
"learning_rate": 0.00044366197183098594,
"loss": 1.5489,
"step": 114
},
{
"epoch": 0.12314281889974568,
"grad_norm": 0.3275073766708374,
"learning_rate": 0.0004431202600216685,
"loss": 1.4882,
"step": 115
},
{
"epoch": 0.12421362602061303,
"grad_norm": 0.35690388083457947,
"learning_rate": 0.00044257854821235106,
"loss": 1.4762,
"step": 116
},
{
"epoch": 0.12528443314148038,
"grad_norm": 0.668167233467102,
"learning_rate": 0.0004420368364030336,
"loss": 1.5231,
"step": 117
},
{
"epoch": 0.12635524026234773,
"grad_norm": 0.3807876408100128,
"learning_rate": 0.0004414951245937162,
"loss": 1.5125,
"step": 118
},
{
"epoch": 0.12742604738321509,
"grad_norm": 0.32847508788108826,
"learning_rate": 0.00044095341278439874,
"loss": 1.4791,
"step": 119
},
{
"epoch": 0.12849685450408246,
"grad_norm": 0.34058675169944763,
"learning_rate": 0.00044041170097508124,
"loss": 1.4917,
"step": 120
},
{
"epoch": 0.12956766162494981,
"grad_norm": 0.3316013216972351,
"learning_rate": 0.0004398699891657638,
"loss": 1.5397,
"step": 121
},
{
"epoch": 0.13063846874581717,
"grad_norm": 0.32970407605171204,
"learning_rate": 0.00043932827735644636,
"loss": 1.56,
"step": 122
},
{
"epoch": 0.13170927586668452,
"grad_norm": 0.3216981887817383,
"learning_rate": 0.0004387865655471289,
"loss": 1.4856,
"step": 123
},
{
"epoch": 0.13278008298755187,
"grad_norm": 0.3492419421672821,
"learning_rate": 0.00043824485373781147,
"loss": 1.4941,
"step": 124
},
{
"epoch": 0.13385089010841922,
"grad_norm": 0.3463359475135803,
"learning_rate": 0.00043770314192849403,
"loss": 1.5003,
"step": 125
},
{
"epoch": 0.13492169722928657,
"grad_norm": 0.3727024793624878,
"learning_rate": 0.0004371614301191766,
"loss": 1.4981,
"step": 126
},
{
"epoch": 0.13599250435015392,
"grad_norm": 0.5523554086685181,
"learning_rate": 0.00043661971830985915,
"loss": 1.5786,
"step": 127
},
{
"epoch": 0.13706331147102127,
"grad_norm": 0.32683220505714417,
"learning_rate": 0.00043607800650054176,
"loss": 1.4902,
"step": 128
},
{
"epoch": 0.13813411859188865,
"grad_norm": 0.3415539562702179,
"learning_rate": 0.0004355362946912243,
"loss": 1.4875,
"step": 129
},
{
"epoch": 0.139204925712756,
"grad_norm": 0.3191353976726532,
"learning_rate": 0.0004349945828819069,
"loss": 1.4759,
"step": 130
},
{
"epoch": 0.14027573283362335,
"grad_norm": 0.35508468747138977,
"learning_rate": 0.00043445287107258943,
"loss": 1.5611,
"step": 131
},
{
"epoch": 0.1413465399544907,
"grad_norm": 0.33212971687316895,
"learning_rate": 0.00043391115926327194,
"loss": 1.4522,
"step": 132
},
{
"epoch": 0.14241734707535805,
"grad_norm": 0.3219762146472931,
"learning_rate": 0.0004333694474539545,
"loss": 1.4582,
"step": 133
},
{
"epoch": 0.1434881541962254,
"grad_norm": 0.36882877349853516,
"learning_rate": 0.00043282773564463705,
"loss": 1.5347,
"step": 134
},
{
"epoch": 0.14455896131709275,
"grad_norm": 0.33573803305625916,
"learning_rate": 0.0004322860238353196,
"loss": 1.4876,
"step": 135
},
{
"epoch": 0.1456297684379601,
"grad_norm": 0.33557966351509094,
"learning_rate": 0.00043174431202600217,
"loss": 1.4536,
"step": 136
},
{
"epoch": 0.14670057555882746,
"grad_norm": 0.3364240527153015,
"learning_rate": 0.0004312026002166847,
"loss": 1.5241,
"step": 137
},
{
"epoch": 0.1477713826796948,
"grad_norm": 0.31000298261642456,
"learning_rate": 0.0004306608884073673,
"loss": 1.4427,
"step": 138
},
{
"epoch": 0.14884218980056219,
"grad_norm": 0.31178000569343567,
"learning_rate": 0.00043011917659804984,
"loss": 1.5455,
"step": 139
},
{
"epoch": 0.14991299692142954,
"grad_norm": 0.3283156752586365,
"learning_rate": 0.0004295774647887324,
"loss": 1.5277,
"step": 140
},
{
"epoch": 0.1509838040422969,
"grad_norm": 0.34077680110931396,
"learning_rate": 0.00042903575297941496,
"loss": 1.5203,
"step": 141
},
{
"epoch": 0.15205461116316424,
"grad_norm": 0.3414633870124817,
"learning_rate": 0.0004284940411700975,
"loss": 1.5143,
"step": 142
},
{
"epoch": 0.1531254182840316,
"grad_norm": 0.3262156844139099,
"learning_rate": 0.0004279523293607801,
"loss": 1.492,
"step": 143
},
{
"epoch": 0.15419622540489894,
"grad_norm": 0.3537783920764923,
"learning_rate": 0.00042741061755146263,
"loss": 1.5223,
"step": 144
},
{
"epoch": 0.1552670325257663,
"grad_norm": 0.339911550283432,
"learning_rate": 0.0004268689057421452,
"loss": 1.5162,
"step": 145
},
{
"epoch": 0.15633783964663364,
"grad_norm": 0.36946552991867065,
"learning_rate": 0.00042632719393282775,
"loss": 1.4668,
"step": 146
},
{
"epoch": 0.157408646767501,
"grad_norm": 0.33070170879364014,
"learning_rate": 0.0004257854821235103,
"loss": 1.4606,
"step": 147
},
{
"epoch": 0.15847945388836837,
"grad_norm": 0.33413979411125183,
"learning_rate": 0.00042524377031419287,
"loss": 1.5032,
"step": 148
},
{
"epoch": 0.15955026100923572,
"grad_norm": 0.3402380049228668,
"learning_rate": 0.0004247020585048754,
"loss": 1.52,
"step": 149
},
{
"epoch": 0.16062106813010307,
"grad_norm": 0.3602783679962158,
"learning_rate": 0.000424160346695558,
"loss": 1.5349,
"step": 150
},
{
"epoch": 0.16169187525097042,
"grad_norm": 0.32968804240226746,
"learning_rate": 0.00042361863488624054,
"loss": 1.4369,
"step": 151
},
{
"epoch": 0.16276268237183777,
"grad_norm": 0.3444564938545227,
"learning_rate": 0.0004230769230769231,
"loss": 1.4565,
"step": 152
},
{
"epoch": 0.16383348949270513,
"grad_norm": 0.37572184205055237,
"learning_rate": 0.00042253521126760566,
"loss": 1.4921,
"step": 153
},
{
"epoch": 0.16490429661357248,
"grad_norm": 0.3675267994403839,
"learning_rate": 0.0004219934994582882,
"loss": 1.5345,
"step": 154
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.34972381591796875,
"learning_rate": 0.0004214517876489707,
"loss": 1.4759,
"step": 155
},
{
"epoch": 0.16704591085530718,
"grad_norm": 0.35719773173332214,
"learning_rate": 0.0004209100758396533,
"loss": 1.5401,
"step": 156
},
{
"epoch": 0.16811671797617453,
"grad_norm": 0.3391767144203186,
"learning_rate": 0.00042036836403033583,
"loss": 1.5129,
"step": 157
},
{
"epoch": 0.1691875250970419,
"grad_norm": 0.34171062707901,
"learning_rate": 0.0004198266522210184,
"loss": 1.5304,
"step": 158
},
{
"epoch": 0.17025833221790926,
"grad_norm": 0.3329889476299286,
"learning_rate": 0.000419284940411701,
"loss": 1.4794,
"step": 159
},
{
"epoch": 0.1713291393387766,
"grad_norm": 0.329875111579895,
"learning_rate": 0.00041874322860238356,
"loss": 1.4658,
"step": 160
},
{
"epoch": 0.17239994645964396,
"grad_norm": 0.36654773354530334,
"learning_rate": 0.0004182015167930661,
"loss": 1.5079,
"step": 161
},
{
"epoch": 0.1734707535805113,
"grad_norm": 0.3587745130062103,
"learning_rate": 0.0004176598049837487,
"loss": 1.4352,
"step": 162
},
{
"epoch": 0.17454156070137866,
"grad_norm": 0.32216113805770874,
"learning_rate": 0.00041711809317443124,
"loss": 1.4214,
"step": 163
},
{
"epoch": 0.175612367822246,
"grad_norm": 0.34425267577171326,
"learning_rate": 0.0004165763813651138,
"loss": 1.5408,
"step": 164
},
{
"epoch": 0.17668317494311336,
"grad_norm": 0.34980979561805725,
"learning_rate": 0.00041603466955579635,
"loss": 1.4995,
"step": 165
},
{
"epoch": 0.17775398206398071,
"grad_norm": 0.33706167340278625,
"learning_rate": 0.00041549295774647886,
"loss": 1.4966,
"step": 166
},
{
"epoch": 0.1788247891848481,
"grad_norm": 0.3577290177345276,
"learning_rate": 0.0004149512459371614,
"loss": 1.5051,
"step": 167
},
{
"epoch": 0.17989559630571544,
"grad_norm": 0.33480167388916016,
"learning_rate": 0.000414409534127844,
"loss": 1.4846,
"step": 168
},
{
"epoch": 0.1809664034265828,
"grad_norm": 0.3389778137207031,
"learning_rate": 0.00041386782231852653,
"loss": 1.4659,
"step": 169
},
{
"epoch": 0.18203721054745015,
"grad_norm": 0.34035906195640564,
"learning_rate": 0.0004133261105092091,
"loss": 1.5269,
"step": 170
},
{
"epoch": 0.1831080176683175,
"grad_norm": 0.33953285217285156,
"learning_rate": 0.00041278439869989165,
"loss": 1.5608,
"step": 171
},
{
"epoch": 0.18417882478918485,
"grad_norm": 0.331253319978714,
"learning_rate": 0.0004122426868905742,
"loss": 1.4238,
"step": 172
},
{
"epoch": 0.1852496319100522,
"grad_norm": 0.3417370915412903,
"learning_rate": 0.00041170097508125676,
"loss": 1.5335,
"step": 173
},
{
"epoch": 0.18632043903091955,
"grad_norm": 0.3459537923336029,
"learning_rate": 0.0004111592632719393,
"loss": 1.5405,
"step": 174
},
{
"epoch": 0.1873912461517869,
"grad_norm": 0.34250974655151367,
"learning_rate": 0.00041061755146262193,
"loss": 1.5451,
"step": 175
},
{
"epoch": 0.18846205327265425,
"grad_norm": 0.35121142864227295,
"learning_rate": 0.0004100758396533045,
"loss": 1.4584,
"step": 176
},
{
"epoch": 0.18953286039352163,
"grad_norm": 0.3343502879142761,
"learning_rate": 0.00040953412784398705,
"loss": 1.4967,
"step": 177
},
{
"epoch": 0.19060366751438898,
"grad_norm": 0.3440572917461395,
"learning_rate": 0.00040899241603466955,
"loss": 1.5322,
"step": 178
},
{
"epoch": 0.19167447463525633,
"grad_norm": 0.3478721082210541,
"learning_rate": 0.0004084507042253521,
"loss": 1.4887,
"step": 179
},
{
"epoch": 0.19274528175612368,
"grad_norm": 0.3297663927078247,
"learning_rate": 0.00040790899241603467,
"loss": 1.4321,
"step": 180
},
{
"epoch": 0.19381608887699103,
"grad_norm": 0.3527899384498596,
"learning_rate": 0.00040736728060671723,
"loss": 1.5411,
"step": 181
},
{
"epoch": 0.19488689599785838,
"grad_norm": 0.3361954987049103,
"learning_rate": 0.0004068255687973998,
"loss": 1.4383,
"step": 182
},
{
"epoch": 0.19595770311872573,
"grad_norm": 0.35988926887512207,
"learning_rate": 0.00040628385698808235,
"loss": 1.4807,
"step": 183
},
{
"epoch": 0.19702851023959309,
"grad_norm": 0.35412025451660156,
"learning_rate": 0.0004057421451787649,
"loss": 1.5432,
"step": 184
},
{
"epoch": 0.19809931736046044,
"grad_norm": 0.3374565541744232,
"learning_rate": 0.00040520043336944746,
"loss": 1.4895,
"step": 185
},
{
"epoch": 0.1991701244813278,
"grad_norm": 0.35347357392311096,
"learning_rate": 0.00040465872156013,
"loss": 1.4761,
"step": 186
},
{
"epoch": 0.20024093160219517,
"grad_norm": 0.34612298011779785,
"learning_rate": 0.0004041170097508126,
"loss": 1.4867,
"step": 187
},
{
"epoch": 0.20131173872306252,
"grad_norm": 0.36123159527778625,
"learning_rate": 0.00040357529794149514,
"loss": 1.4753,
"step": 188
},
{
"epoch": 0.20238254584392987,
"grad_norm": 0.37735962867736816,
"learning_rate": 0.00040303358613217764,
"loss": 1.5158,
"step": 189
},
{
"epoch": 0.20345335296479722,
"grad_norm": 0.365067720413208,
"learning_rate": 0.00040249187432286025,
"loss": 1.5493,
"step": 190
},
{
"epoch": 0.20452416008566457,
"grad_norm": 0.33235374093055725,
"learning_rate": 0.0004019501625135428,
"loss": 1.495,
"step": 191
},
{
"epoch": 0.20559496720653192,
"grad_norm": 0.35279738903045654,
"learning_rate": 0.00040140845070422537,
"loss": 1.4681,
"step": 192
},
{
"epoch": 0.20666577432739927,
"grad_norm": 0.342896968126297,
"learning_rate": 0.0004008667388949079,
"loss": 1.5163,
"step": 193
},
{
"epoch": 0.20773658144826662,
"grad_norm": 0.34132811427116394,
"learning_rate": 0.0004003250270855905,
"loss": 1.4822,
"step": 194
},
{
"epoch": 0.20880738856913397,
"grad_norm": 0.34202563762664795,
"learning_rate": 0.00039978331527627304,
"loss": 1.44,
"step": 195
},
{
"epoch": 0.20987819569000135,
"grad_norm": 0.3383086919784546,
"learning_rate": 0.0003992416034669556,
"loss": 1.4993,
"step": 196
},
{
"epoch": 0.2109490028108687,
"grad_norm": 0.35314062237739563,
"learning_rate": 0.00039869989165763816,
"loss": 1.5139,
"step": 197
},
{
"epoch": 0.21201980993173605,
"grad_norm": 0.3365531265735626,
"learning_rate": 0.0003981581798483207,
"loss": 1.429,
"step": 198
},
{
"epoch": 0.2130906170526034,
"grad_norm": 0.33675894141197205,
"learning_rate": 0.0003976164680390033,
"loss": 1.4568,
"step": 199
},
{
"epoch": 0.21416142417347075,
"grad_norm": 0.340620219707489,
"learning_rate": 0.00039707475622968583,
"loss": 1.4935,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 933,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.528726582329344e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}