{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21416142417347075,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010708071208673537,
"grad_norm": 3.1792166233062744,
"learning_rate": 5e-05,
"loss": 2.9696,
"step": 1
},
{
"epoch": 0.0021416142417347074,
"grad_norm": 3.353651285171509,
"learning_rate": 0.0001,
"loss": 3.0758,
"step": 2
},
{
"epoch": 0.0032124213626020613,
"grad_norm": 1.886090874671936,
"learning_rate": 0.00015,
"loss": 2.7586,
"step": 3
},
{
"epoch": 0.004283228483469415,
"grad_norm": 1.451682686805725,
"learning_rate": 0.0002,
"loss": 2.5324,
"step": 4
},
{
"epoch": 0.005354035604336769,
"grad_norm": 1.175742268562317,
"learning_rate": 0.00025,
"loss": 2.2141,
"step": 5
},
{
"epoch": 0.0064248427252041225,
"grad_norm": 0.868193507194519,
"learning_rate": 0.0003,
"loss": 2.0835,
"step": 6
},
{
"epoch": 0.007495649846071476,
"grad_norm": 1.0772305727005005,
"learning_rate": 0.00035,
"loss": 1.9921,
"step": 7
},
{
"epoch": 0.00856645696693883,
"grad_norm": 1.069272518157959,
"learning_rate": 0.0004,
"loss": 1.9016,
"step": 8
},
{
"epoch": 0.009637264087806184,
"grad_norm": 0.7301461100578308,
"learning_rate": 0.00045000000000000004,
"loss": 1.8262,
"step": 9
},
{
"epoch": 0.010708071208673538,
"grad_norm": 0.49968260526657104,
"learning_rate": 0.0005,
"loss": 1.6998,
"step": 10
},
{
"epoch": 0.01177887832954089,
"grad_norm": 0.42115330696105957,
"learning_rate": 0.0004994582881906825,
"loss": 1.6768,
"step": 11
},
{
"epoch": 0.012849685450408245,
"grad_norm": 0.5151969790458679,
"learning_rate": 0.0004989165763813651,
"loss": 1.7301,
"step": 12
},
{
"epoch": 0.0139204925712756,
"grad_norm": 0.604058563709259,
"learning_rate": 0.0004983748645720476,
"loss": 1.6961,
"step": 13
},
{
"epoch": 0.014991299692142952,
"grad_norm": 0.4526136815547943,
"learning_rate": 0.0004978331527627302,
"loss": 1.6385,
"step": 14
},
{
"epoch": 0.016062106813010308,
"grad_norm": 0.3732638657093048,
"learning_rate": 0.0004972914409534127,
"loss": 1.6927,
"step": 15
},
{
"epoch": 0.01713291393387766,
"grad_norm": 0.39037632942199707,
"learning_rate": 0.0004967497291440954,
"loss": 1.6063,
"step": 16
},
{
"epoch": 0.018203721054745013,
"grad_norm": 0.41541412472724915,
"learning_rate": 0.0004962080173347779,
"loss": 1.6535,
"step": 17
},
{
"epoch": 0.019274528175612368,
"grad_norm": 0.33182990550994873,
"learning_rate": 0.0004956663055254605,
"loss": 1.5564,
"step": 18
},
{
"epoch": 0.020345335296479722,
"grad_norm": 0.3516808748245239,
"learning_rate": 0.0004951245937161431,
"loss": 1.6012,
"step": 19
},
{
"epoch": 0.021416142417347076,
"grad_norm": 0.3928525447845459,
"learning_rate": 0.0004945828819068256,
"loss": 1.6524,
"step": 20
},
{
"epoch": 0.02248694953821443,
"grad_norm": 0.3181082308292389,
"learning_rate": 0.0004940411700975082,
"loss": 1.6055,
"step": 21
},
{
"epoch": 0.02355775665908178,
"grad_norm": 0.30989620089530945,
"learning_rate": 0.0004934994582881907,
"loss": 1.6236,
"step": 22
},
{
"epoch": 0.024628563779949136,
"grad_norm": 0.3335777521133423,
"learning_rate": 0.0004929577464788732,
"loss": 1.6403,
"step": 23
},
{
"epoch": 0.02569937090081649,
"grad_norm": 0.36894136667251587,
"learning_rate": 0.0004924160346695558,
"loss": 1.6778,
"step": 24
},
{
"epoch": 0.026770178021683844,
"grad_norm": 0.3191300928592682,
"learning_rate": 0.0004918743228602383,
"loss": 1.5897,
"step": 25
},
{
"epoch": 0.0278409851425512,
"grad_norm": 0.3290117681026459,
"learning_rate": 0.0004913326110509209,
"loss": 1.6285,
"step": 26
},
{
"epoch": 0.028911792263418553,
"grad_norm": 0.307182252407074,
"learning_rate": 0.0004907908992416034,
"loss": 1.5576,
"step": 27
},
{
"epoch": 0.029982599384285904,
"grad_norm": 0.28709110617637634,
"learning_rate": 0.0004902491874322861,
"loss": 1.6744,
"step": 28
},
{
"epoch": 0.031053406505153258,
"grad_norm": 0.33125609159469604,
"learning_rate": 0.0004897074756229686,
"loss": 1.6106,
"step": 29
},
{
"epoch": 0.032124213626020616,
"grad_norm": 0.31909990310668945,
"learning_rate": 0.0004891657638136512,
"loss": 1.5368,
"step": 30
},
{
"epoch": 0.03319502074688797,
"grad_norm": 0.34221193194389343,
"learning_rate": 0.0004886240520043337,
"loss": 1.6336,
"step": 31
},
{
"epoch": 0.03426582786775532,
"grad_norm": 0.34219980239868164,
"learning_rate": 0.00048808234019501623,
"loss": 1.6243,
"step": 32
},
{
"epoch": 0.035336634988622675,
"grad_norm": 0.29287898540496826,
"learning_rate": 0.0004875406283856988,
"loss": 1.5441,
"step": 33
},
{
"epoch": 0.036407442109490026,
"grad_norm": 0.29403921961784363,
"learning_rate": 0.0004869989165763814,
"loss": 1.651,
"step": 34
},
{
"epoch": 0.037478249230357384,
"grad_norm": 0.3238803446292877,
"learning_rate": 0.00048645720476706396,
"loss": 1.6178,
"step": 35
},
{
"epoch": 0.038549056351224735,
"grad_norm": 0.3332749903202057,
"learning_rate": 0.0004859154929577465,
"loss": 1.5395,
"step": 36
},
{
"epoch": 0.03961986347209209,
"grad_norm": 0.33042415976524353,
"learning_rate": 0.0004853737811484291,
"loss": 1.5116,
"step": 37
},
{
"epoch": 0.040690670592959444,
"grad_norm": 0.32300877571105957,
"learning_rate": 0.00048483206933911164,
"loss": 1.5697,
"step": 38
},
{
"epoch": 0.041761477713826795,
"grad_norm": 0.35760653018951416,
"learning_rate": 0.00048429035752979414,
"loss": 1.629,
"step": 39
},
{
"epoch": 0.04283228483469415,
"grad_norm": 0.3095184564590454,
"learning_rate": 0.0004837486457204767,
"loss": 1.571,
"step": 40
},
{
"epoch": 0.0439030919555615,
"grad_norm": 0.30683574080467224,
"learning_rate": 0.00048320693391115926,
"loss": 1.5357,
"step": 41
},
{
"epoch": 0.04497389907642886,
"grad_norm": 0.33406275510787964,
"learning_rate": 0.0004826652221018418,
"loss": 1.6077,
"step": 42
},
{
"epoch": 0.04604470619729621,
"grad_norm": 0.42627573013305664,
"learning_rate": 0.0004821235102925244,
"loss": 1.5662,
"step": 43
},
{
"epoch": 0.04711551331816356,
"grad_norm": 0.3232003152370453,
"learning_rate": 0.00048158179848320693,
"loss": 1.6063,
"step": 44
},
{
"epoch": 0.04818632043903092,
"grad_norm": 0.4828573763370514,
"learning_rate": 0.0004810400866738895,
"loss": 1.523,
"step": 45
},
{
"epoch": 0.04925712755989827,
"grad_norm": 0.39869874715805054,
"learning_rate": 0.00048049837486457205,
"loss": 1.5844,
"step": 46
},
{
"epoch": 0.05032793468076563,
"grad_norm": 0.36061400175094604,
"learning_rate": 0.0004799566630552546,
"loss": 1.589,
"step": 47
},
{
"epoch": 0.05139874180163298,
"grad_norm": 0.3593485951423645,
"learning_rate": 0.00047941495124593716,
"loss": 1.5149,
"step": 48
},
{
"epoch": 0.05246954892250034,
"grad_norm": 0.3493165373802185,
"learning_rate": 0.0004788732394366197,
"loss": 1.586,
"step": 49
},
{
"epoch": 0.05354035604336769,
"grad_norm": 0.3129478394985199,
"learning_rate": 0.00047833152762730233,
"loss": 1.5374,
"step": 50
},
{
"epoch": 0.05461116316423504,
"grad_norm": 0.3232264816761017,
"learning_rate": 0.00047778981581798484,
"loss": 1.5473,
"step": 51
},
{
"epoch": 0.0556819702851024,
"grad_norm": 0.3314213752746582,
"learning_rate": 0.0004772481040086674,
"loss": 1.5624,
"step": 52
},
{
"epoch": 0.05675277740596975,
"grad_norm": 0.3443197011947632,
"learning_rate": 0.00047670639219934995,
"loss": 1.523,
"step": 53
},
{
"epoch": 0.057823584526837106,
"grad_norm": 0.3222476840019226,
"learning_rate": 0.0004761646803900325,
"loss": 1.6094,
"step": 54
},
{
"epoch": 0.05889439164770446,
"grad_norm": 0.30979102849960327,
"learning_rate": 0.00047562296858071507,
"loss": 1.6053,
"step": 55
},
{
"epoch": 0.05996519876857181,
"grad_norm": 0.3003416061401367,
"learning_rate": 0.00047508125677139763,
"loss": 1.4889,
"step": 56
},
{
"epoch": 0.061036005889439165,
"grad_norm": 0.3053031861782074,
"learning_rate": 0.0004745395449620802,
"loss": 1.5641,
"step": 57
},
{
"epoch": 0.062106813010306516,
"grad_norm": 0.31200629472732544,
"learning_rate": 0.00047399783315276275,
"loss": 1.5857,
"step": 58
},
{
"epoch": 0.06317762013117387,
"grad_norm": 0.3085310757160187,
"learning_rate": 0.0004734561213434453,
"loss": 1.5795,
"step": 59
},
{
"epoch": 0.06424842725204123,
"grad_norm": 0.3053343892097473,
"learning_rate": 0.00047291440953412786,
"loss": 1.48,
"step": 60
},
{
"epoch": 0.06531923437290858,
"grad_norm": 0.31742650270462036,
"learning_rate": 0.0004723726977248104,
"loss": 1.5267,
"step": 61
},
{
"epoch": 0.06639004149377593,
"grad_norm": 0.302557110786438,
"learning_rate": 0.0004718309859154929,
"loss": 1.4835,
"step": 62
},
{
"epoch": 0.06746084861464328,
"grad_norm": 0.3269102871417999,
"learning_rate": 0.0004712892741061755,
"loss": 1.6023,
"step": 63
},
{
"epoch": 0.06853165573551064,
"grad_norm": 0.3242720365524292,
"learning_rate": 0.00047074756229685804,
"loss": 1.6019,
"step": 64
},
{
"epoch": 0.069602462856378,
"grad_norm": 0.3117155134677887,
"learning_rate": 0.00047020585048754065,
"loss": 1.5719,
"step": 65
},
{
"epoch": 0.07067326997724535,
"grad_norm": 0.31575411558151245,
"learning_rate": 0.0004696641386782232,
"loss": 1.5588,
"step": 66
},
{
"epoch": 0.0717440770981127,
"grad_norm": 0.3055570125579834,
"learning_rate": 0.00046912242686890577,
"loss": 1.54,
"step": 67
},
{
"epoch": 0.07281488421898005,
"grad_norm": 0.30278709530830383,
"learning_rate": 0.0004685807150595883,
"loss": 1.4943,
"step": 68
},
{
"epoch": 0.0738856913398474,
"grad_norm": 0.31028270721435547,
"learning_rate": 0.0004680390032502709,
"loss": 1.4901,
"step": 69
},
{
"epoch": 0.07495649846071477,
"grad_norm": 0.3005111515522003,
"learning_rate": 0.00046749729144095344,
"loss": 1.4811,
"step": 70
},
{
"epoch": 0.07602730558158212,
"grad_norm": 0.31970301270484924,
"learning_rate": 0.000466955579631636,
"loss": 1.5812,
"step": 71
},
{
"epoch": 0.07709811270244947,
"grad_norm": 0.31910890340805054,
"learning_rate": 0.00046641386782231856,
"loss": 1.5398,
"step": 72
},
{
"epoch": 0.07816891982331682,
"grad_norm": 0.34352612495422363,
"learning_rate": 0.0004658721560130011,
"loss": 1.6016,
"step": 73
},
{
"epoch": 0.07923972694418419,
"grad_norm": 0.3307402729988098,
"learning_rate": 0.0004653304442036836,
"loss": 1.5357,
"step": 74
},
{
"epoch": 0.08031053406505154,
"grad_norm": 0.31802475452423096,
"learning_rate": 0.0004647887323943662,
"loss": 1.5463,
"step": 75
},
{
"epoch": 0.08138134118591889,
"grad_norm": 0.3045582175254822,
"learning_rate": 0.00046424702058504874,
"loss": 1.4936,
"step": 76
},
{
"epoch": 0.08245214830678624,
"grad_norm": 0.3408415913581848,
"learning_rate": 0.0004637053087757313,
"loss": 1.526,
"step": 77
},
{
"epoch": 0.08352295542765359,
"grad_norm": 0.3176616430282593,
"learning_rate": 0.00046316359696641385,
"loss": 1.5581,
"step": 78
},
{
"epoch": 0.08459376254852095,
"grad_norm": 0.3179102838039398,
"learning_rate": 0.0004626218851570964,
"loss": 1.5525,
"step": 79
},
{
"epoch": 0.0856645696693883,
"grad_norm": 0.3425735831260681,
"learning_rate": 0.00046208017334777897,
"loss": 1.4914,
"step": 80
},
{
"epoch": 0.08673537679025566,
"grad_norm": 0.36185234785079956,
"learning_rate": 0.0004615384615384616,
"loss": 1.5293,
"step": 81
},
{
"epoch": 0.087806183911123,
"grad_norm": 0.3470607399940491,
"learning_rate": 0.00046099674972914414,
"loss": 1.5388,
"step": 82
},
{
"epoch": 0.08887699103199036,
"grad_norm": 0.3171769976615906,
"learning_rate": 0.0004604550379198267,
"loss": 1.4932,
"step": 83
},
{
"epoch": 0.08994779815285772,
"grad_norm": 0.3396613895893097,
"learning_rate": 0.00045991332611050926,
"loss": 1.5367,
"step": 84
},
{
"epoch": 0.09101860527372507,
"grad_norm": 0.3147753179073334,
"learning_rate": 0.0004593716143011918,
"loss": 1.5413,
"step": 85
},
{
"epoch": 0.09208941239459242,
"grad_norm": 0.3213801383972168,
"learning_rate": 0.0004588299024918743,
"loss": 1.4544,
"step": 86
},
{
"epoch": 0.09316021951545977,
"grad_norm": 0.3900924623012543,
"learning_rate": 0.0004582881906825569,
"loss": 1.5155,
"step": 87
},
{
"epoch": 0.09423102663632713,
"grad_norm": 0.34930315613746643,
"learning_rate": 0.00045774647887323943,
"loss": 1.5323,
"step": 88
},
{
"epoch": 0.09530183375719449,
"grad_norm": 0.32511013746261597,
"learning_rate": 0.000457204767063922,
"loss": 1.484,
"step": 89
},
{
"epoch": 0.09637264087806184,
"grad_norm": 0.3209106922149658,
"learning_rate": 0.00045666305525460455,
"loss": 1.4659,
"step": 90
},
{
"epoch": 0.09744344799892919,
"grad_norm": 0.3438887298107147,
"learning_rate": 0.0004561213434452871,
"loss": 1.522,
"step": 91
},
{
"epoch": 0.09851425511979654,
"grad_norm": 0.5644230842590332,
"learning_rate": 0.00045557963163596967,
"loss": 1.5703,
"step": 92
},
{
"epoch": 0.0995850622406639,
"grad_norm": 0.35866114497184753,
"learning_rate": 0.0004550379198266522,
"loss": 1.5637,
"step": 93
},
{
"epoch": 0.10065586936153126,
"grad_norm": 0.3141271770000458,
"learning_rate": 0.0004544962080173348,
"loss": 1.5275,
"step": 94
},
{
"epoch": 0.10172667648239861,
"grad_norm": 0.3229062557220459,
"learning_rate": 0.00045395449620801734,
"loss": 1.509,
"step": 95
},
{
"epoch": 0.10279748360326596,
"grad_norm": 0.3184738755226135,
"learning_rate": 0.0004534127843986999,
"loss": 1.5243,
"step": 96
},
{
"epoch": 0.10386829072413331,
"grad_norm": 0.33315855264663696,
"learning_rate": 0.00045287107258938246,
"loss": 1.4969,
"step": 97
},
{
"epoch": 0.10493909784500068,
"grad_norm": 0.37624651193618774,
"learning_rate": 0.000452329360780065,
"loss": 1.5713,
"step": 98
},
{
"epoch": 0.10600990496586803,
"grad_norm": 0.3466942608356476,
"learning_rate": 0.0004517876489707476,
"loss": 1.4497,
"step": 99
},
{
"epoch": 0.10708071208673538,
"grad_norm": 0.3428940773010254,
"learning_rate": 0.00045124593716143013,
"loss": 1.5272,
"step": 100
},
{
"epoch": 0.10815151920760273,
"grad_norm": 0.32997605204582214,
"learning_rate": 0.0004507042253521127,
"loss": 1.5664,
"step": 101
},
{
"epoch": 0.10922232632847008,
"grad_norm": 0.35048359632492065,
"learning_rate": 0.00045016251354279525,
"loss": 1.4883,
"step": 102
},
{
"epoch": 0.11029313344933744,
"grad_norm": 0.3379492461681366,
"learning_rate": 0.0004496208017334778,
"loss": 1.4706,
"step": 103
},
{
"epoch": 0.1113639405702048,
"grad_norm": 0.36966028809547424,
"learning_rate": 0.00044907908992416036,
"loss": 1.5116,
"step": 104
},
{
"epoch": 0.11243474769107215,
"grad_norm": 0.3487953245639801,
"learning_rate": 0.0004485373781148429,
"loss": 1.5147,
"step": 105
},
{
"epoch": 0.1135055548119395,
"grad_norm": 0.3422049582004547,
"learning_rate": 0.0004479956663055255,
"loss": 1.4782,
"step": 106
},
{
"epoch": 0.11457636193280685,
"grad_norm": 0.3196428716182709,
"learning_rate": 0.00044745395449620804,
"loss": 1.4375,
"step": 107
},
{
"epoch": 0.11564716905367421,
"grad_norm": 0.3369114398956299,
"learning_rate": 0.00044691224268689054,
"loss": 1.5261,
"step": 108
},
{
"epoch": 0.11671797617454156,
"grad_norm": 0.35993748903274536,
"learning_rate": 0.0004463705308775731,
"loss": 1.5136,
"step": 109
},
{
"epoch": 0.11778878329540891,
"grad_norm": 0.3427882790565491,
"learning_rate": 0.00044582881906825566,
"loss": 1.5352,
"step": 110
},
{
"epoch": 0.11885959041627626,
"grad_norm": 0.3308979570865631,
"learning_rate": 0.0004452871072589382,
"loss": 1.4979,
"step": 111
},
{
"epoch": 0.11993039753714362,
"grad_norm": 0.3407396376132965,
"learning_rate": 0.00044474539544962083,
"loss": 1.5055,
"step": 112
},
{
"epoch": 0.12100120465801098,
"grad_norm": 0.34919309616088867,
"learning_rate": 0.0004442036836403034,
"loss": 1.5032,
"step": 113
},
{
"epoch": 0.12207201177887833,
"grad_norm": 0.34088361263275146,
"learning_rate": 0.00044366197183098594,
"loss": 1.5489,
"step": 114
},
{
"epoch": 0.12314281889974568,
"grad_norm": 0.3275073766708374,
"learning_rate": 0.0004431202600216685,
"loss": 1.4882,
"step": 115
},
{
"epoch": 0.12421362602061303,
"grad_norm": 0.35690388083457947,
"learning_rate": 0.00044257854821235106,
"loss": 1.4762,
"step": 116
},
{
"epoch": 0.12528443314148038,
"grad_norm": 0.668167233467102,
"learning_rate": 0.0004420368364030336,
"loss": 1.5231,
"step": 117
},
{
"epoch": 0.12635524026234773,
"grad_norm": 0.3807876408100128,
"learning_rate": 0.0004414951245937162,
"loss": 1.5125,
"step": 118
},
{
"epoch": 0.12742604738321509,
"grad_norm": 0.32847508788108826,
"learning_rate": 0.00044095341278439874,
"loss": 1.4791,
"step": 119
},
{
"epoch": 0.12849685450408246,
"grad_norm": 0.34058675169944763,
"learning_rate": 0.00044041170097508124,
"loss": 1.4917,
"step": 120
},
{
"epoch": 0.12956766162494981,
"grad_norm": 0.3316013216972351,
"learning_rate": 0.0004398699891657638,
"loss": 1.5397,
"step": 121
},
{
"epoch": 0.13063846874581717,
"grad_norm": 0.32970407605171204,
"learning_rate": 0.00043932827735644636,
"loss": 1.56,
"step": 122
},
{
"epoch": 0.13170927586668452,
"grad_norm": 0.3216981887817383,
"learning_rate": 0.0004387865655471289,
"loss": 1.4856,
"step": 123
},
{
"epoch": 0.13278008298755187,
"grad_norm": 0.3492419421672821,
"learning_rate": 0.00043824485373781147,
"loss": 1.4941,
"step": 124
},
{
"epoch": 0.13385089010841922,
"grad_norm": 0.3463359475135803,
"learning_rate": 0.00043770314192849403,
"loss": 1.5003,
"step": 125
},
{
"epoch": 0.13492169722928657,
"grad_norm": 0.3727024793624878,
"learning_rate": 0.0004371614301191766,
"loss": 1.4981,
"step": 126
},
{
"epoch": 0.13599250435015392,
"grad_norm": 0.5523554086685181,
"learning_rate": 0.00043661971830985915,
"loss": 1.5786,
"step": 127
},
{
"epoch": 0.13706331147102127,
"grad_norm": 0.32683220505714417,
"learning_rate": 0.00043607800650054176,
"loss": 1.4902,
"step": 128
},
{
"epoch": 0.13813411859188865,
"grad_norm": 0.3415539562702179,
"learning_rate": 0.0004355362946912243,
"loss": 1.4875,
"step": 129
},
{
"epoch": 0.139204925712756,
"grad_norm": 0.3191353976726532,
"learning_rate": 0.0004349945828819069,
"loss": 1.4759,
"step": 130
},
{
"epoch": 0.14027573283362335,
"grad_norm": 0.35508468747138977,
"learning_rate": 0.00043445287107258943,
"loss": 1.5611,
"step": 131
},
{
"epoch": 0.1413465399544907,
"grad_norm": 0.33212971687316895,
"learning_rate": 0.00043391115926327194,
"loss": 1.4522,
"step": 132
},
{
"epoch": 0.14241734707535805,
"grad_norm": 0.3219762146472931,
"learning_rate": 0.0004333694474539545,
"loss": 1.4582,
"step": 133
},
{
"epoch": 0.1434881541962254,
"grad_norm": 0.36882877349853516,
"learning_rate": 0.00043282773564463705,
"loss": 1.5347,
"step": 134
},
{
"epoch": 0.14455896131709275,
"grad_norm": 0.33573803305625916,
"learning_rate": 0.0004322860238353196,
"loss": 1.4876,
"step": 135
},
{
"epoch": 0.1456297684379601,
"grad_norm": 0.33557966351509094,
"learning_rate": 0.00043174431202600217,
"loss": 1.4536,
"step": 136
},
{
"epoch": 0.14670057555882746,
"grad_norm": 0.3364240527153015,
"learning_rate": 0.0004312026002166847,
"loss": 1.5241,
"step": 137
},
{
"epoch": 0.1477713826796948,
"grad_norm": 0.31000298261642456,
"learning_rate": 0.0004306608884073673,
"loss": 1.4427,
"step": 138
},
{
"epoch": 0.14884218980056219,
"grad_norm": 0.31178000569343567,
"learning_rate": 0.00043011917659804984,
"loss": 1.5455,
"step": 139
},
{
"epoch": 0.14991299692142954,
"grad_norm": 0.3283156752586365,
"learning_rate": 0.0004295774647887324,
"loss": 1.5277,
"step": 140
},
{
"epoch": 0.1509838040422969,
"grad_norm": 0.34077680110931396,
"learning_rate": 0.00042903575297941496,
"loss": 1.5203,
"step": 141
},
{
"epoch": 0.15205461116316424,
"grad_norm": 0.3414633870124817,
"learning_rate": 0.0004284940411700975,
"loss": 1.5143,
"step": 142
},
{
"epoch": 0.1531254182840316,
"grad_norm": 0.3262156844139099,
"learning_rate": 0.0004279523293607801,
"loss": 1.492,
"step": 143
},
{
"epoch": 0.15419622540489894,
"grad_norm": 0.3537783920764923,
"learning_rate": 0.00042741061755146263,
"loss": 1.5223,
"step": 144
},
{
"epoch": 0.1552670325257663,
"grad_norm": 0.339911550283432,
"learning_rate": 0.0004268689057421452,
"loss": 1.5162,
"step": 145
},
{
"epoch": 0.15633783964663364,
"grad_norm": 0.36946552991867065,
"learning_rate": 0.00042632719393282775,
"loss": 1.4668,
"step": 146
},
{
"epoch": 0.157408646767501,
"grad_norm": 0.33070170879364014,
"learning_rate": 0.0004257854821235103,
"loss": 1.4606,
"step": 147
},
{
"epoch": 0.15847945388836837,
"grad_norm": 0.33413979411125183,
"learning_rate": 0.00042524377031419287,
"loss": 1.5032,
"step": 148
},
{
"epoch": 0.15955026100923572,
"grad_norm": 0.3402380049228668,
"learning_rate": 0.0004247020585048754,
"loss": 1.52,
"step": 149
},
{
"epoch": 0.16062106813010307,
"grad_norm": 0.3602783679962158,
"learning_rate": 0.000424160346695558,
"loss": 1.5349,
"step": 150
},
{
"epoch": 0.16169187525097042,
"grad_norm": 0.32968804240226746,
"learning_rate": 0.00042361863488624054,
"loss": 1.4369,
"step": 151
},
{
"epoch": 0.16276268237183777,
"grad_norm": 0.3444564938545227,
"learning_rate": 0.0004230769230769231,
"loss": 1.4565,
"step": 152
},
{
"epoch": 0.16383348949270513,
"grad_norm": 0.37572184205055237,
"learning_rate": 0.00042253521126760566,
"loss": 1.4921,
"step": 153
},
{
"epoch": 0.16490429661357248,
"grad_norm": 0.3675267994403839,
"learning_rate": 0.0004219934994582882,
"loss": 1.5345,
"step": 154
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.34972381591796875,
"learning_rate": 0.0004214517876489707,
"loss": 1.4759,
"step": 155
},
{
"epoch": 0.16704591085530718,
"grad_norm": 0.35719773173332214,
"learning_rate": 0.0004209100758396533,
"loss": 1.5401,
"step": 156
},
{
"epoch": 0.16811671797617453,
"grad_norm": 0.3391767144203186,
"learning_rate": 0.00042036836403033583,
"loss": 1.5129,
"step": 157
},
{
"epoch": 0.1691875250970419,
"grad_norm": 0.34171062707901,
"learning_rate": 0.0004198266522210184,
"loss": 1.5304,
"step": 158
},
{
"epoch": 0.17025833221790926,
"grad_norm": 0.3329889476299286,
"learning_rate": 0.000419284940411701,
"loss": 1.4794,
"step": 159
},
{
"epoch": 0.1713291393387766,
"grad_norm": 0.329875111579895,
"learning_rate": 0.00041874322860238356,
"loss": 1.4658,
"step": 160
},
{
"epoch": 0.17239994645964396,
"grad_norm": 0.36654773354530334,
"learning_rate": 0.0004182015167930661,
"loss": 1.5079,
"step": 161
},
{
"epoch": 0.1734707535805113,
"grad_norm": 0.3587745130062103,
"learning_rate": 0.0004176598049837487,
"loss": 1.4352,
"step": 162
},
{
"epoch": 0.17454156070137866,
"grad_norm": 0.32216113805770874,
"learning_rate": 0.00041711809317443124,
"loss": 1.4214,
"step": 163
},
{
"epoch": 0.175612367822246,
"grad_norm": 0.34425267577171326,
"learning_rate": 0.0004165763813651138,
"loss": 1.5408,
"step": 164
},
{
"epoch": 0.17668317494311336,
"grad_norm": 0.34980979561805725,
"learning_rate": 0.00041603466955579635,
"loss": 1.4995,
"step": 165
},
{
"epoch": 0.17775398206398071,
"grad_norm": 0.33706167340278625,
"learning_rate": 0.00041549295774647886,
"loss": 1.4966,
"step": 166
},
{
"epoch": 0.1788247891848481,
"grad_norm": 0.3577290177345276,
"learning_rate": 0.0004149512459371614,
"loss": 1.5051,
"step": 167
},
{
"epoch": 0.17989559630571544,
"grad_norm": 0.33480167388916016,
"learning_rate": 0.000414409534127844,
"loss": 1.4846,
"step": 168
},
{
"epoch": 0.1809664034265828,
"grad_norm": 0.3389778137207031,
"learning_rate": 0.00041386782231852653,
"loss": 1.4659,
"step": 169
},
{
"epoch": 0.18203721054745015,
"grad_norm": 0.34035906195640564,
"learning_rate": 0.0004133261105092091,
"loss": 1.5269,
"step": 170
},
{
"epoch": 0.1831080176683175,
"grad_norm": 0.33953285217285156,
"learning_rate": 0.00041278439869989165,
"loss": 1.5608,
"step": 171
},
{
"epoch": 0.18417882478918485,
"grad_norm": 0.331253319978714,
"learning_rate": 0.0004122426868905742,
"loss": 1.4238,
"step": 172
},
{
"epoch": 0.1852496319100522,
"grad_norm": 0.3417370915412903,
"learning_rate": 0.00041170097508125676,
"loss": 1.5335,
"step": 173
},
{
"epoch": 0.18632043903091955,
"grad_norm": 0.3459537923336029,
"learning_rate": 0.0004111592632719393,
"loss": 1.5405,
"step": 174
},
{
"epoch": 0.1873912461517869,
"grad_norm": 0.34250974655151367,
"learning_rate": 0.00041061755146262193,
"loss": 1.5451,
"step": 175
},
{
"epoch": 0.18846205327265425,
"grad_norm": 0.35121142864227295,
"learning_rate": 0.0004100758396533045,
"loss": 1.4584,
"step": 176
},
{
"epoch": 0.18953286039352163,
"grad_norm": 0.3343502879142761,
"learning_rate": 0.00040953412784398705,
"loss": 1.4967,
"step": 177
},
{
"epoch": 0.19060366751438898,
"grad_norm": 0.3440572917461395,
"learning_rate": 0.00040899241603466955,
"loss": 1.5322,
"step": 178
},
{
"epoch": 0.19167447463525633,
"grad_norm": 0.3478721082210541,
"learning_rate": 0.0004084507042253521,
"loss": 1.4887,
"step": 179
},
{
"epoch": 0.19274528175612368,
"grad_norm": 0.3297663927078247,
"learning_rate": 0.00040790899241603467,
"loss": 1.4321,
"step": 180
},
{
"epoch": 0.19381608887699103,
"grad_norm": 0.3527899384498596,
"learning_rate": 0.00040736728060671723,
"loss": 1.5411,
"step": 181
},
{
"epoch": 0.19488689599785838,
"grad_norm": 0.3361954987049103,
"learning_rate": 0.0004068255687973998,
"loss": 1.4383,
"step": 182
},
{
"epoch": 0.19595770311872573,
"grad_norm": 0.35988926887512207,
"learning_rate": 0.00040628385698808235,
"loss": 1.4807,
"step": 183
},
{
"epoch": 0.19702851023959309,
"grad_norm": 0.35412025451660156,
"learning_rate": 0.0004057421451787649,
"loss": 1.5432,
"step": 184
},
{
"epoch": 0.19809931736046044,
"grad_norm": 0.3374565541744232,
"learning_rate": 0.00040520043336944746,
"loss": 1.4895,
"step": 185
},
{
"epoch": 0.1991701244813278,
"grad_norm": 0.35347357392311096,
"learning_rate": 0.00040465872156013,
"loss": 1.4761,
"step": 186
},
{
"epoch": 0.20024093160219517,
"grad_norm": 0.34612298011779785,
"learning_rate": 0.0004041170097508126,
"loss": 1.4867,
"step": 187
},
{
"epoch": 0.20131173872306252,
"grad_norm": 0.36123159527778625,
"learning_rate": 0.00040357529794149514,
"loss": 1.4753,
"step": 188
},
{
"epoch": 0.20238254584392987,
"grad_norm": 0.37735962867736816,
"learning_rate": 0.00040303358613217764,
"loss": 1.5158,
"step": 189
},
{
"epoch": 0.20345335296479722,
"grad_norm": 0.365067720413208,
"learning_rate": 0.00040249187432286025,
"loss": 1.5493,
"step": 190
},
{
"epoch": 0.20452416008566457,
"grad_norm": 0.33235374093055725,
"learning_rate": 0.0004019501625135428,
"loss": 1.495,
"step": 191
},
{
"epoch": 0.20559496720653192,
"grad_norm": 0.35279738903045654,
"learning_rate": 0.00040140845070422537,
"loss": 1.4681,
"step": 192
},
{
"epoch": 0.20666577432739927,
"grad_norm": 0.342896968126297,
"learning_rate": 0.0004008667388949079,
"loss": 1.5163,
"step": 193
},
{
"epoch": 0.20773658144826662,
"grad_norm": 0.34132811427116394,
"learning_rate": 0.0004003250270855905,
"loss": 1.4822,
"step": 194
},
{
"epoch": 0.20880738856913397,
"grad_norm": 0.34202563762664795,
"learning_rate": 0.00039978331527627304,
"loss": 1.44,
"step": 195
},
{
"epoch": 0.20987819569000135,
"grad_norm": 0.3383086919784546,
"learning_rate": 0.0003992416034669556,
"loss": 1.4993,
"step": 196
},
{
"epoch": 0.2109490028108687,
"grad_norm": 0.35314062237739563,
"learning_rate": 0.00039869989165763816,
"loss": 1.5139,
"step": 197
},
{
"epoch": 0.21201980993173605,
"grad_norm": 0.3365531265735626,
"learning_rate": 0.0003981581798483207,
"loss": 1.429,
"step": 198
},
{
"epoch": 0.2130906170526034,
"grad_norm": 0.33675894141197205,
"learning_rate": 0.0003976164680390033,
"loss": 1.4568,
"step": 199
},
{
"epoch": 0.21416142417347075,
"grad_norm": 0.340620219707489,
"learning_rate": 0.00039707475622968583,
"loss": 1.4935,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 933,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.528726582329344e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}